From d4c7189ede50380323892019b101adcf83ae6e5d Mon Sep 17 00:00:00 2001 From: chen jinqian Date: Mon, 6 May 2024 10:40:46 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E5=AE=81=E6=B3=A2=E4=B8=AD?= =?UTF-8?q?=E4=BB=8B=E8=B6=85=E5=B8=82=E6=8F=90=E5=8F=96=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawler.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/crawler.py b/crawler.py index 89088a0..10281cb 100644 --- a/crawler.py +++ b/crawler.py @@ -229,14 +229,14 @@ class Crawler: # 定义要传递进去的关于公告信息类型的数据结构 infoType = [ - {"announcementCode": "110-175885", "announcementType":"采购意向"}, - {"announcementCode": "110-978863", "announcementType":"采购公告"}, - {"announcementCode": "110-943756", "announcementType":"更正公告"}, + #{"announcementCode": "110-175885", "announcementType":"采购意向"}, + #{"announcementCode": "110-978863", "announcementType":"采购公告"}, + #{"announcementCode": "110-943756", "announcementType":"更正公告"}, {"announcementCode": "110-774650", "announcementType":"非政府采购公告"}, - {"announcementCode": "110-900461", "announcementType":"结果公告"} + #{"announcementCode": "110-900461", "announcementType":"结果公告"} ] for typeParam in infoType: - for page in range(1, 11): + for page in range(1, 70): try: self.CrawlPage_zfcg_czt_zj(page, typeParam) except Exception as e: @@ -255,6 +255,7 @@ class Crawler: except Exception as e: print('4--------------------------------', e) + # 爬取宁波市中介超市网的信息 print('开始获取宁波市中介超市网的信息\n') infoType = [ @@ -384,7 +385,7 @@ class Crawler: print(url, title) announcementType = typeParam['announcementType'] - #print(title, url, region, publishDate, announcementType) + print(publishDate, url, page, title) self.write_information(title, url, region, publishDate, announcementType) @@ -431,6 +432,7 @@ class Crawler: region = '浙江国际招标' publishDate = item.xpath('//p')[1].text announcementType = typeParam['announcementType'] + print(publishDate, url, 
page, title) self.write_information(title, url, region, publishDate, announcementType) @@ -468,6 +470,7 @@ class Crawler: region = '宁波名诚招标' publishDate = item.xpath('//a/div[4]')[0].text announcementType = typeParam['announcementType'] + print(publishDate, url, page, title) self.write_information(title, url, region, publishDate, announcementType) @@ -512,10 +515,9 @@ class Crawler: region = '宁波国际招标' publishDate = item['addtime'] announcementType = item['stage'] + print(publishDate, url, page, title) self.write_information(title, url, region, publishDate, announcementType) - print(publishDate, title, url) - # 宁波市国资委属企业招标信息网 def CrawlPage_gzw_ningbo(self, page): # 这个方法是实际爬取指定页面的信息。 @@ -548,6 +550,7 @@ class Crawler: region = '宁波市属国企' publishDate = item.xpath('//p')[0].text announcementType = '采购公告' + print(publishDate, url, page, title) self.write_information(title, url, region, publishDate, announcementType) @@ -558,7 +561,7 @@ class Crawler: # 这个方法是实际爬取指定页面的信息。 # type 用于判别采购信息的类型 session = HTMLSession() - urllist = ['http://122.247.77.99:443/siteapi/api/Portal/GetBulletinInfoList'] + urllist = ['https://zjcs.zwb.ningbo.gov.cn/siteapi/api/Portal/GetBulletinInfoList'] headers = { "Accept": "application/json, text/javascript, */*; q=0.01", @@ -571,7 +574,6 @@ class Crawler: payload = { "page": page, "pageSize": 10, - "center_id": "", "bulletin_type_id": typeParam["announcementCode"] } @@ -579,6 +581,7 @@ class Crawler: r = session.post(url = url, headers = headers, json = payload) if r.status_code != 200: + print("error") if page == 1: gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text) return False @@ -591,15 +594,14 @@ class Crawler: for item in data: articleId = item['auto_id'] BulletinTypeId = typeParam["announcementCode"] - url = 'http://122.247.77.99:443/gDetails?id=%s' % (articleId) + url = 'https://zjcs.zwb.ningbo.gov.cn/gDetails?id=%s' % (articleId) title = item['bulletin_title'] region = '宁波中介超市' publishDate = 
item['publish_date'].replace('T', ' ') announcementType = typeParam['announcementType'] + print(publishDate, url, page, title) self.write_information(title, url, region, publishDate, announcementType) - #print(publishDate, title, url) - # 宁波阳光采购网 def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam): url = 'https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType=' + typeParam['announcementCode'] @@ -632,7 +634,7 @@ class Crawler: region = '宁波阳光采购' publishDate = item.xpath('//div[2]')[0].text announcementType = typeParam['announcementType'] - print(title) + print(publishDate, url, page, title) self.write_information(title, url, region, publishDate, announcementType) @@ -714,6 +716,7 @@ class Crawler: else: title = item['title'] region = item['districtName'] + print(publishDate, url, page, title) self.write_information(title, pageUrl, region, publishDate, announcementType) #print(publishDate, url)