diff --git a/crawler.py b/crawler.py index 8018b22..8be7f2b 100644 --- a/crawler.py +++ b/crawler.py @@ -257,8 +257,11 @@ class Crawler: # 爬取宁波市中介超市网的信息 print('开始获取宁波市中介超市网的信息\n') infoType = [ - {"announcementCode": '1', "announcementType":"项目需求公告"}, - {"announcementCode": '2', "announcementType":"结果公告"} + {"announcementCode": '10', "announcementType":"业务需求公告"}, + {"announcementCode": '11', "announcementType":"业务需求补充公告"}, + {"announcementCode": '20', "announcementType":"中选结果公告"}, + {"announcementCode": '21', "announcementType":"中选结果补充公告"}, + {"announcementCode": '22', "announcementType":"中选结果补充公告"} ] for typeParam in infoType: @@ -268,7 +271,7 @@ class Crawler: except Exception as e: print('5------------------------------', e) - # 爬取宁波市国资委市属企业采购信息 + # 爬取宁波市国资委市属企业采购信息 print('开始获取宁波市国资委市属企业招投标网的信息\n') for page in range(1, 5): try: @@ -554,7 +557,7 @@ class Crawler: # 这个方法是实际爬取指定页面的信息。 # type 用于判别采购信息的类型 session = HTMLSession() - urllist = ['http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0901&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15','http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0902&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15'] + urllist = ['http://122.247.77.99:443/siteapi/api/Portal/GetBulletinInfoList'] headers = { "Accept": "application/json, text/javascript, */*; q=0.01", @@ -562,36 +565,39 @@ class Crawler: "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Connection": "keep-alive", "DNT": '1', - "Host": "ygcg.nbcqjy.org", - "Referer": "http://zjcs.nbxzfw.gov.cn/newsweb/page/news/infolist.html?Type="+str(type), "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306" } + payload = { + "page": page, + "pageSize": 10, + "center_id": "", + "bulletin_type_id": typeParam["announcementCode"] + } for url in urllist: - r = session.get(url = url, headers = headers) + r = session.post(url = url, 
headers = headers, json = payload) if r.status_code != 200: if page == 1: gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text) return False - data = json.loads(r.text)['data'] + data = json.loads(r.text)['body'] total = data['total'] - data = data['rows'] + data = data['data']['bulletinInfoList'] for item in data: - articleId = item['AutoId'] - BulletinTypeId = item['BulletinTypeId'] - url = 'http://zjcs.nbxzfw.gov.cn/YWGG/Info?id=%s&Type=%s' % (articleId, BulletinTypeId) - title = item['BulletinTitle'] + articleId = item['auto_id'] + BulletinTypeId = typeParam["announcementCode"] + url = 'http://122.247.77.99:443/gDetails?id=%s' % (articleId) + title = item['bulletin_title'] region = '宁波中介超市' - publishDate = item['PublishDate'] + publishDate = item['publish_date'].replace('T', ' ') announcementType = typeParam['announcementType'] self.write_information(title, url, region, publishDate, announcementType) - #print(publishDate, url) - + #print(publishDate, title, url) # 宁波阳光采购网 def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
diff --git a/sql/isfiltered.sql b/sql/isfiltered.sql
new file mode 100644
index 0000000..5b22e1e
--- /dev/null
+++ b/sql/isfiltered.sql
@@ -0,0 +1,37 @@
+-- isfiltered(field): return true as soon as findbykeyword(field, keyword) is
+-- truthy for one keyword of the comma-separated list stored in
+-- guoyan.sysconfigure (fieldname = 'PresaleKeyword'); return false otherwise.
+--
+-- field: the text handed to findbykeyword() together with each keyword.
+CREATE DEFINER=`root`@`%` FUNCTION `isfiltered`(field varchar(255)) RETURNS tinyint
+BEGIN
+    set @keywords = "";
+    set @word = "";
+    SELECT trim(value) into @keywords FROM guoyan.sysconfigure where fieldname = 'PresaleKeyword';
+    -- Strip spaces and normalise separators to the ASCII comma.
+    -- NOTE(review): as received, the inner replace mapped ',' to ',' (a no-op);
+    -- the full-width comma U+FF0C is presumably what was meant - confirm.
+    set @keywords = replace(replace(@keywords, ' ', ''), char(0xEFBC8C using utf8mb4), ',');
+    -- Trailing separator so every keyword, including the last, ends with ','.
+    set @keywords = concat(@keywords, ',');
+
+    while length(@keywords) > 0 do
+        select SUBSTRING_INDEX(@keywords, ',', 1) into @word;
+
+        -- NOTE(review): an empty token (empty config value, or ",,") is still
+        -- passed to findbykeyword() - confirm that is intended.
+        if findbykeyword(field, @word) then
+            return true;
+        end if;
+
+        -- Drop only the leading "<word>," by position. Using replace() here
+        -- would remove every occurrence of "<word>,", eating the tail of longer
+        -- keywords ("ab,cab," would become "c") and leaving a remainder with
+        -- no separator, so the loop would never terminate.
+        set @keywords = substring(@keywords, char_length(@word) + 2);
+
+    end while;
+
+    return false;
+
+END