Browse Source

更新宁波中介超市提取代码

master
chen jinqian 5 months ago
parent
commit
9377a0a5c1
  1. 38
      crawler.py
  2. 22
      sql/isfiltered.sql

38
crawler.py

@ -257,8 +257,11 @@ class Crawler:
# 爬取宁波市中介超市网的信息 # 爬取宁波市中介超市网的信息
print('开始获取宁波市中介超市网的信息\n') print('开始获取宁波市中介超市网的信息\n')
infoType = [ infoType = [
{"announcementCode": '1', "announcementType":"项目需求公告"}, {"announcementCode": '10', "announcementType":"业务需求公告"},
{"announcementCode": '2', "announcementType":"结果公告"} {"announcementCode": '11', "announcementType":"业务需求补充公告"},
{"announcementCode": '20', "announcementType":"中选结果公告"},
{"announcementCode": '21', "announcementType":"中选结果补充公告"},
{"announcementCode": '22', "announcementType":"中选结果补充公告"}
] ]
for typeParam in infoType: for typeParam in infoType:
@ -268,7 +271,7 @@ class Crawler:
except Exception as e: except Exception as e:
print('5------------------------------', e) print('5------------------------------', e)
# 爬取宁波市国资委市属企业采购信息 # 爬取宁波市国资委市属企业采购信息
print('开始获取宁波市国资委市属企业招投标网的信息\n') print('开始获取宁波市国资委市属企业招投标网的信息\n')
for page in range(1, 5): for page in range(1, 5):
try: try:
@ -554,7 +557,7 @@ class Crawler:
# 这个方法是实际爬取指定页面的信息。 # 这个方法是实际爬取指定页面的信息。
# type 用于判别采购信息的类型 # type 用于判别采购信息的类型
session = HTMLSession() session = HTMLSession()
urllist = ['http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0901&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15','http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0902&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15'] urllist = ['http://122.247.77.99:443/siteapi/api/Portal/GetBulletinInfoList']
headers = { headers = {
"Accept": "application/json, text/javascript, */*; q=0.01", "Accept": "application/json, text/javascript, */*; q=0.01",
@ -562,36 +565,39 @@ class Crawler:
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive", "Connection": "keep-alive",
"DNT": '1', "DNT": '1',
"Host": "ygcg.nbcqjy.org",
"Referer": "http://zjcs.nbxzfw.gov.cn/newsweb/page/news/infolist.html?Type="+str(type),
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306" "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
} }
payload = {
"page": page,
"pageSize": 10,
"center_id": "",
"bulletin_type_id": typeParam["announcementCode"]
}
for url in urllist: for url in urllist:
r = session.get(url = url, headers = headers) r = session.post(url = url, headers = headers, json = payload)
if r.status_code != 200: if r.status_code != 200:
if page == 1: if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text) gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text)
return False return False
data = json.loads(r.text)['data'] data = json.loads(r.text)['body']
total = data['total'] total = data['total']
data = data['rows'] data = data['data']['bulletinInfoList']
for item in data: for item in data:
articleId = item['AutoId'] articleId = item['auto_id']
BulletinTypeId = item['BulletinTypeId'] BulletinTypeId = typeParam["announcementCode"]
url = 'http://zjcs.nbxzfw.gov.cn/YWGG/Info?id=%s&Type=%s' % (articleId, BulletinTypeId) url = 'http://122.247.77.99:443/gDetails?id=%s' % (articleId)
title = item['BulletinTitle'] title = item['bulletin_title']
region = '宁波中介超市' region = '宁波中介超市'
publishDate = item['PublishDate'] publishDate = item['publish_date'].replace('T', ' ')
announcementType = typeParam['announcementType'] announcementType = typeParam['announcementType']
self.write_information(title, url, region, publishDate, announcementType) self.write_information(title, url, region, publishDate, announcementType)
#print(publishDate, url) #print(publishDate, title, url)
# 宁波阳光采购网 # 宁波阳光采购网
def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam): def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):

22
sql/isfiltered.sql

@ -0,0 +1,22 @@
-- isfiltered(field)
--   Returns true (1) when findbykeyword(field, keyword) succeeds for at least
--   one keyword in the comma-separated list stored in guoyan.sysconfigure
--   under fieldname = 'PresaleKeyword'; returns false (0) otherwise.
--   The matching rule itself lives in findbykeyword(), defined elsewhere.
--
--   field : the text to test against the configured keywords.
--
--   NOTE(review): the working state is kept in the session user variables
--   @keywords and @word, so calling this function overwrites any values the
--   session already holds under those names.
CREATE DEFINER=`root`@`%` FUNCTION `isfiltered`(field varchar(255)) RETURNS tinyint
BEGIN
    -- Reset the session variables so a previous call cannot leak state in
    -- when the SELECT below finds no row (INTO then leaves them unchanged).
    set @keywords = '';
    set @word = '';

    SELECT trim(value) into @keywords
      FROM guoyan.sysconfigure
     where fieldname = 'PresaleKeyword';

    -- Inner REPLACE strips every space from the list.
    -- NOTE(review): the outer REPLACE has an empty search string, which
    -- appears to be a no-op; presumably a separator character (e.g. a
    -- full-width comma) was lost here -- confirm against the repository.
    set @keywords = replace(replace(@keywords, ' ', ''), '', ',');

    -- Guarantee a trailing delimiter so every keyword is followed by ','.
    set @keywords = concat(@keywords, ',');

    while length(@keywords) > 0 do
        -- Leading token: everything before the first comma.
        set @word = SUBSTRING_INDEX(@keywords, ',', 1);

        -- Consume exactly that token plus its delimiter, by position.
        -- A global REPLACE of 'word,' would also strip the same text from
        -- inside later keywords (e.g. 'ab,xab,' would become 'x'), leaving a
        -- remainder that never shrinks and makes this loop spin forever.
        set @keywords = substring(@keywords, char_length(@word) + 2);

        -- Skip empty tokens (doubled or trailing commas, empty configuration)
        -- instead of asking findbykeyword() to match an empty keyword.
        if char_length(@word) > 0 and findbykeyword(field, @word) then
            return true;
        end if;
    end while;

    return false;
END
Loading…
Cancel
Save