
Update the extraction code for the Ningbo intermediary services supermarket (宁波中介超市)

Branch: master
Author: chen jinqian, 5 months ago
Commit: 9377a0a5c1
crawler.py (36 changed lines)
sql/isfiltered.sql (22 changed lines)

crawler.py

@@ -257,8 +257,11 @@ class Crawler:
         # Crawl information from the Ningbo intermediary services supermarket site
         print('开始获取宁波市中介超市网的信息\n')
         infoType = [
-            {"announcementCode": '1', "announcementType":"项目需求公告"},
-            {"announcementCode": '2', "announcementType":"结果公告"}
+            {"announcementCode": '10', "announcementType":"业务需求公告"},
+            {"announcementCode": '11', "announcementType":"业务需求补充公告"},
+            {"announcementCode": '20', "announcementType":"中选结果公告"},
+            {"announcementCode": '21', "announcementType":"中选结果补充公告"},
+            {"announcementCode": '22', "announcementType":"中选结果补充公告"}
         ]
         for typeParam in infoType:
@@ -554,7 +557,7 @@ class Crawler:
         # This method does the actual crawling of the specified page.
         # type is used to tell the type of procurement information apart
         session = HTMLSession()
-        urllist = ['http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0901&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15','http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0902&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15']
+        urllist = ['http://122.247.77.99:443/siteapi/api/Portal/GetBulletinInfoList']
         headers = {
             "Accept": "application/json, text/javascript, */*; q=0.01",
@@ -562,36 +565,39 @@ class Crawler:
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"DNT": '1',
"Host": "ygcg.nbcqjy.org",
"Referer": "http://zjcs.nbxzfw.gov.cn/newsweb/page/news/infolist.html?Type="+str(type),
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
payload = {
"page": page,
"pageSize": 10,
"center_id": "",
"bulletin_type_id": typeParam["announcementCode"]
}
for url in urllist:
r = session.get(url = url, headers = headers)
r = session.post(url = url, headers = headers, json = payload)
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text)
return False
data = json.loads(r.text)['data']
data = json.loads(r.text)['body']
total = data['total']
data = data['rows']
data = data['data']['bulletinInfoList']
for item in data:
articleId = item['AutoId']
BulletinTypeId = item['BulletinTypeId']
url = 'http://zjcs.nbxzfw.gov.cn/YWGG/Info?id=%s&Type=%s' % (articleId, BulletinTypeId)
title = item['BulletinTitle']
articleId = item['auto_id']
BulletinTypeId = typeParam["announcementCode"]
url = 'http://122.247.77.99:443/gDetails?id=%s' % (articleId)
title = item['bulletin_title']
region = '宁波中介超市'
publishDate = item['PublishDate']
publishDate = item['publish_date'].replace('T', ' ')
announcementType = typeParam['announcementType']
self.write_information(title, url, region, publishDate, announcementType)
#print(publishDate, url)
#print(publishDate, title, url)
# 宁波阳光采购网
def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
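Note: the rewritten crawl method now POSTs a JSON body to the new portal API instead of issuing GET requests with query-string parameters against zjcs.nbxzfw.gov.cn. The sketch below is a minimal, self-contained illustration of that request/parse flow, assuming the response shape visible in the diff ('body' holding 'total' and 'data' -> 'bulletinInfoList', with 'auto_id', 'bulletin_title' and 'publish_date' per item); the helper name fetch_bulletins and the use of plain requests instead of the HTMLSession used in crawler.py are illustrative assumptions, not code from the repository.

import requests

API_URL = 'http://122.247.77.99:443/siteapi/api/Portal/GetBulletinInfoList'

def fetch_bulletins(page, bulletin_type_id, page_size=10):
    # JSON body the new portal API expects (field names taken from the diff above).
    payload = {
        "page": page,
        "pageSize": page_size,
        "center_id": "",
        "bulletin_type_id": bulletin_type_id,
    }
    r = requests.post(API_URL, json=payload, timeout=30)
    r.raise_for_status()
    body = r.json()['body']                    # the new API wraps results in 'body'
    items = body['data']['bulletinInfoList']   # list of bulletin records
    bulletins = []
    for item in items:
        bulletins.append({
            'url': 'http://122.247.77.99:443/gDetails?id=%s' % item['auto_id'],
            'title': item['bulletin_title'],
            # publish_date appears to be ISO 8601 ('...T...'); normalize as the commit does
            'publishDate': item['publish_date'].replace('T', ' '),
        })
    return bulletins

# Example: first page of announcement type '10' (业务需求公告)
# print(fetch_bulletins(1, '10'))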

sql/isfiltered.sql (new file)

@@ -0,0 +1,22 @@
+CREATE DEFINER=`root`@`%` FUNCTION `isfiltered`(field varchar(255)) RETURNS tinyint
+BEGIN
+    set @keywords = "";
+    set @word = "";
+    SELECT trim(value) into @keywords FROM guoyan.sysconfigure where fieldname = 'PresaleKeyword';
+    set @keywords = replace(replace(@keywords, ' ', ''), '，', ',');
+    set @keywords = concat(@keywords, ',');
+    while length(@keywords) > 0 do
+        select SUBSTRING_INDEX(@keywords, ',', 1) into @word;
+        if findbykeyword(field, @word) then
+            return true;
+        end if;
+        set @keywords = replace(@keywords, concat(@word, ','), '');
+    end while;
+    return false;
+END
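The new isfiltered() function loads the keyword list stored under 'PresaleKeyword' in guoyan.sysconfigure, strips spaces, then walks the comma separated list one keyword at a time and returns 1 as soon as the findbykeyword() helper (which is referenced but not created by this commit) matches the given field, 0 otherwise. Below is a hedged sketch of exercising it from Python; pymysql, the connection settings and the sample title are placeholders for illustration, only the isfiltered() call itself comes from this commit.

import pymysql

# Connection parameters are placeholders; point them at the real guoyan database.
conn = pymysql.connect(host='127.0.0.1', user='root', password='***', database='guoyan')
try:
    with conn.cursor() as cur:
        # isfiltered(field) returns tinyint 1/0, so it can also be used directly
        # inside WHERE clauses to drop rows matching any PresaleKeyword entry.
        cur.execute("SELECT isfiltered(%s)", ("示例公告标题",))
        (flagged,) = cur.fetchone()
        print('filtered' if flagged else 'kept')
finally:
    conn.close()

Keeping the keyword match inside MySQL (rather than in crawler.py) means announcements that are already stored can be re-filtered with a single SELECT or UPDATE whenever the PresaleKeyword configuration changes.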