|
@ -257,8 +257,11 @@ class Crawler: |
|
|
# 爬取宁波市中介超市网的信息 |
|
|
# 爬取宁波市中介超市网的信息 |
|
|
print('开始获取宁波市中介超市网的信息\n') |
|
|
print('开始获取宁波市中介超市网的信息\n') |
|
|
infoType = [ |
|
|
infoType = [ |
|
|
{"announcementCode": '1', "announcementType":"项目需求公告"}, |
|
|
{"announcementCode": '10', "announcementType":"业务需求公告"}, |
|
|
{"announcementCode": '2', "announcementType":"结果公告"} |
|
|
{"announcementCode": '11', "announcementType":"业务需求补充公告"}, |
|
|
|
|
|
{"announcementCode": '20', "announcementType":"中选结果公告"}, |
|
|
|
|
|
{"announcementCode": '21', "announcementType":"中选结果补充公告"}, |
|
|
|
|
|
{"announcementCode": '22', "announcementType":"中选结果补充公告"} |
|
|
] |
|
|
] |
|
|
|
|
|
|
|
|
for typeParam in infoType: |
|
|
for typeParam in infoType: |
|
@ -268,7 +271,7 @@ class Crawler: |
|
|
except Exception as e: |
|
|
except Exception as e: |
|
|
print('5------------------------------', e) |
|
|
print('5------------------------------', e) |
|
|
|
|
|
|
|
|
# 爬取宁波市国资委市属企业采购信息 |
|
|
# 爬取宁波市国资委市属企业采购信息 |
|
|
print('开始获取宁波市国资委市属企业招投标网的信息\n') |
|
|
print('开始获取宁波市国资委市属企业招投标网的信息\n') |
|
|
for page in range(1, 5): |
|
|
for page in range(1, 5): |
|
|
try: |
|
|
try: |
|
@ -554,7 +557,7 @@ class Crawler: |
|
|
# 这个方法是实际爬取指定页面的信息。 |
|
|
# 这个方法是实际爬取指定页面的信息。 |
|
|
# type 用于判别采购信息的类型 |
|
|
# type 用于判别采购信息的类型 |
|
|
session = HTMLSession() |
|
|
session = HTMLSession() |
|
|
urllist = ['http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0901&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15','http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0902&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15'] |
|
|
urllist = ['http://122.247.77.99:443/siteapi/api/Portal/GetBulletinInfoList'] |
|
|
|
|
|
|
|
|
headers = { |
|
|
headers = { |
|
|
"Accept": "application/json, text/javascript, */*; q=0.01", |
|
|
"Accept": "application/json, text/javascript, */*; q=0.01", |
|
@ -562,36 +565,39 @@ class Crawler: |
|
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", |
|
|
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", |
|
|
"Connection": "keep-alive", |
|
|
"Connection": "keep-alive", |
|
|
"DNT": '1', |
|
|
"DNT": '1', |
|
|
"Host": "ygcg.nbcqjy.org", |
|
|
|
|
|
"Referer": "http://zjcs.nbxzfw.gov.cn/newsweb/page/news/infolist.html?Type="+str(type), |
|
|
|
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306" |
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306" |
|
|
} |
|
|
} |
|
|
|
|
|
payload = { |
|
|
|
|
|
"page": page, |
|
|
|
|
|
"pageSize": 10, |
|
|
|
|
|
"center_id": "", |
|
|
|
|
|
"bulletin_type_id": typeParam["announcementCode"] |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
for url in urllist: |
|
|
for url in urllist: |
|
|
r = session.get(url = url, headers = headers) |
|
|
r = session.post(url = url, headers = headers, json = payload) |
|
|
|
|
|
|
|
|
if r.status_code != 200: |
|
|
if r.status_code != 200: |
|
|
if page == 1: |
|
|
if page == 1: |
|
|
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text) |
|
|
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text) |
|
|
return False |
|
|
return False |
|
|
|
|
|
|
|
|
data = json.loads(r.text)['data'] |
|
|
data = json.loads(r.text)['body'] |
|
|
|
|
|
|
|
|
total = data['total'] |
|
|
total = data['total'] |
|
|
data = data['rows'] |
|
|
data = data['data']['bulletinInfoList'] |
|
|
|
|
|
|
|
|
for item in data: |
|
|
for item in data: |
|
|
articleId = item['AutoId'] |
|
|
articleId = item['auto_id'] |
|
|
BulletinTypeId = item['BulletinTypeId'] |
|
|
BulletinTypeId = typeParam["announcementCode"] |
|
|
url = 'http://zjcs.nbxzfw.gov.cn/YWGG/Info?id=%s&Type=%s' % (articleId, BulletinTypeId) |
|
|
url = 'http://122.247.77.99:443/gDetails?id=%s' % (articleId) |
|
|
title = item['BulletinTitle'] |
|
|
title = item['bulletin_title'] |
|
|
region = '宁波中介超市' |
|
|
region = '宁波中介超市' |
|
|
publishDate = item['PublishDate'] |
|
|
publishDate = item['publish_date'].replace('T', ' ') |
|
|
announcementType = typeParam['announcementType'] |
|
|
announcementType = typeParam['announcementType'] |
|
|
self.write_information(title, url, region, publishDate, announcementType) |
|
|
self.write_information(title, url, region, publishDate, announcementType) |
|
|
|
|
|
|
|
|
#print(publishDate, url) |
|
|
#print(publishDate, title, url) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 宁波阳光采购网 |
|
|
# 宁波阳光采购网 |
|
|
def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam): |
|
|
def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam): |
|
|