@@ -229,14 +229,14 @@ class Crawler:
         # Define the announcement-type parameters to pass in
         infoType = [
-            {"announcementCode": "110-175885", "announcementType":"采购意向"},
-            {"announcementCode": "110-978863", "announcementType":"采购公告"},
-            {"announcementCode": "110-943756", "announcementType":"更正公告"},
+            #{"announcementCode": "110-175885", "announcementType":"采购意向"},
+            #{"announcementCode": "110-978863", "announcementType":"采购公告"},
+            #{"announcementCode": "110-943756", "announcementType":"更正公告"},
             {"announcementCode": "110-774650", "announcementType":"非政府采购公告"},
-            {"announcementCode": "110-900461", "announcementType":"结果公告"}
+            #{"announcementCode": "110-900461", "announcementType":"结果公告"}
         ]
         for typeParam in infoType:
-            for page in range(1, 11):
+            for page in range(1, 70):
                 try:
                     self.CrawlPage_zfcg_czt_zj(page, typeParam)
                 except Exception as e:
@@ -255,6 +255,7 @@ class Crawler:
                 except Exception as e:
                     print('4--------------------------------', e)

         # Crawl the Ningbo intermediary supermarket site (宁波市中介超市网)
         print('开始获取宁波市中介超市网的信息\n')
         infoType = [
@@ -384,7 +385,7 @@ class Crawler:

             print(url, title)
             announcementType = typeParam['announcementType']
-            #print(title, url, region, publishDate, announcementType)
+            print(publishDate, url, page, title)
             self.write_information(title, url, region, publishDate, announcementType)

@@ -431,6 +432,7 @@ class Crawler:
             region = '浙江国际招标'
             publishDate = item.xpath('//p')[1].text
             announcementType = typeParam['announcementType']
+            print(publishDate, url, page, title)
             self.write_information(title, url, region, publishDate, announcementType)

@@ -468,6 +470,7 @@ class Crawler:
             region = '宁波名诚招标'
             publishDate = item.xpath('//a/div[4]')[0].text
             announcementType = typeParam['announcementType']
+            print(publishDate, url, page, title)
             self.write_information(title, url, region, publishDate, announcementType)

@@ -512,10 +515,9 @@ class Crawler:
             region = '宁波国际招标'
             publishDate = item['addtime']
             announcementType = item['stage']
+            print(publishDate, url, page, title)
             self.write_information(title, url, region, publishDate, announcementType)
-
-            print(publishDate, title, url)

     # Tender information site of enterprises owned by the Ningbo municipal SASAC (宁波市国资委属企业招标信息网)
     def CrawlPage_gzw_ningbo(self, page):
         # This method does the actual crawl of the given page.
@@ -548,6 +550,7 @@ class Crawler:
             region = '宁波市属国企'
             publishDate = item.xpath('//p')[0].text
             announcementType = '采购公告'
+            print(publishDate, url, page, title)
             self.write_information(title, url, region, publishDate, announcementType)

@@ -558,7 +561,7 @@ class Crawler:
         # This method does the actual crawl of the given page.
         # type identifies the kind of procurement announcement
         session = HTMLSession()
-        urllist = ['http://122.247.77.99:443/siteapi/api/Portal/GetBulletinInfoList']
+        urllist = ['https://zjcs.zwb.ningbo.gov.cn/siteapi/api/Portal/GetBulletinInfoList']

         headers = {
             "Accept": "application/json, text/javascript, */*; q=0.01",
@@ -571,7 +574,6 @@ class Crawler:
         payload = {
             "page": page,
             "pageSize": 10,
-            "center_id": "",
             "bulletin_type_id": typeParam["announcementCode"]
         }

@@ -579,6 +581,7 @@ class Crawler:
             r = session.post(url = url, headers = headers, json = payload)

             if r.status_code != 200:
+                print("error")
                 if page == 1:
                     gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text)
                 return False
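
For reference, the request this hunk guards is a plain JSON POST to the new zjcs.zwb.ningbo.gov.cn endpoint. A minimal standalone sketch, assuming only the endpoint, headers and payload fields shown in these hunks; the {"data": [...]} response shape is inferred from the fields read in the next hunk, not confirmed:

from requests_html import HTMLSession

def fetch_bulletins(page, bulletin_type_id, page_size=10):
    # Hypothetical helper for manual testing; mirrors the payload built in this method.
    url = 'https://zjcs.zwb.ningbo.gov.cn/siteapi/api/Portal/GetBulletinInfoList'
    headers = {"Accept": "application/json, text/javascript, */*; q=0.01"}
    payload = {"page": page, "pageSize": page_size,
               "bulletin_type_id": bulletin_type_id}
    session = HTMLSession()
    r = session.post(url=url, headers=headers, json=payload)
    if r.status_code != 200:
        print("error", r.status_code)
        return []
    return r.json().get('data', [])  # assumed response shape

# e.g. fetch_bulletins(1, "110-978863") for the first page of 采购公告
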
@@ -591,15 +594,14 @@ class Crawler:
             for item in data:
                 articleId = item['auto_id']
                 BulletinTypeId = typeParam["announcementCode"]
-                url = 'http://122.247.77.99:443/gDetails?id=%s' % (articleId)
+                url = 'https://zjcs.zwb.ningbo.gov.cn/gDetails?id=%s' % (articleId)
                 title = item['bulletin_title']
                 region = '宁波中介超市'
                 publishDate = item['publish_date'].replace('T', ' ')
                 announcementType = typeParam['announcementType']
+                print(publishDate, url, page, title)
                 self.write_information(title, url, region, publishDate, announcementType)

-                #print(publishDate, title, url)

     # Ningbo Sunshine Procurement site (宁波阳光采购网)
     def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
         url = 'https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType=' + typeParam['announcementCode']
@@ -632,7 +634,7 @@ class Crawler:
             region = '宁波阳光采购'
             publishDate = item.xpath('//div[2]')[0].text
             announcementType = typeParam['announcementType']
-            print(title)
+            print(publishDate, url, page, title)
             self.write_information(title, url, region, publishDate, announcementType)

@@ -714,6 +716,7 @@ class Crawler:
             else:
                 title = item['title']
             region = item['districtName']
+            print(publishDate, url, page, title)
             self.write_information(title, pageUrl, region, publishDate, announcementType)

             #print(publishDate, url)
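
Note: every hunk in this patch adds the same progress print immediately before self.write_information. If the pattern keeps spreading, a small helper on Crawler (hypothetical, not part of this patch; write_information's signature is taken from the calls above) would keep each call site to one line:

    def record(self, title, url, region, publishDate, announcementType, page):
        # Hypothetical wrapper: log progress, then persist via the existing method.
        print(publishDate, url, page, title)
        self.write_information(title, url, region, publishDate, announcementType)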