售前信息平台
#!/usr/bin/python3
"""
===========================================================================================
文件名称:crawler.py
这是一个用于爬取采购信息的模块
要处理采购公告信息。主要涉及sc_cggg, calalog, catalogdata, readlog四张表
===========================================================================================
class Crawler:
def __init__(self, connect):
def generate_id(self):
def write_log_information(self, data_id, catalog_name):
def CrawlPage_gzw_ningbo(self, page): # 宁波国资委市属国企招标投标信息
def CrawlPage_zjcs_nbxzfw(self, type, page): # 宁波市中介超市
def CrawlPage_ygcg_nbcqjy_org(self, page): # 宁波市阳光采购
def CrawlPage_zfcg_czt_zj(self, page): # 浙江政府采购网
def CrawlPage_cbbidding(self, page): # 宁波中基国际招标有限公司
def CrawlPage_zmeetb(self, page): # 浙江国际招标有限公司
def CrawlPage_nbbidding(self, page): # 宁波国际招标有限公司
============================================================================================
"""
import datetime
import hashlib
import pymysql
import json
import random
from requests_html import HTMLSession
from requests_html import HTML, UserAgent
import gymailer
import time
'''
============================================================
This class wraps the Splash rendering service.
self.splash_ip is the IP address of the Splash server.
============================================================
'''
class Splash:
def __init__(self):
self.splash_ip = '127.0.0.1'
'''
============================================================
The wait_for parameter specifies the element to wait for: the request only returns once that
element has finished rendering, otherwise it keeps polling until the timeout expires.
wait_for takes a CSS selector, e.g. '#app' to target an element by id, or '.class-name'
to target an element by class.
============================================================
'''
def post(self, url, wait_for, pages=1, page_element='', headers={'content-type':'application/json','Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'}):
lua_scripts = """
function wait_for_element(splash, css, maxwait)
-- Wait until a selector matches an element
-- in the page. Return an error if waited more
-- than maxwait seconds.
if maxwait == nil then
maxwait = 10
end
return splash:wait_for_resume(string.format([[
function main(splash) {
var selector = '%s';
var maxwait = %s;
var end = Date.now() + maxwait*1000;
function check() {
if(document.querySelector(selector)) {
splash.resume('Element found');
} else if(Date.now() >= end) {
var err = 'Timeout waiting for element';
splash.error(err + " " + selector);
} else {
setTimeout(check, 200);
}
}
check();
}
]], css, maxwait))
end
function main(splash, args)
pages = """ + str(pages) + """
page_element = '""" + page_element + """'
wait_for = '""" + wait_for + """'
splash:go('""" + url + """')
wait_for_element(splash, wait_for)
wait_for_element(splash, page_element)
-- Add the first page's HTML to the result set
results = {splash.html()}
if pages == 1 then
return results
else
-- Turn the pages:
-- locate the paging element on the page, then send it a click() event
for i = 2, pages do
-- js is the JavaScript snippet that finds the paging element and fires the click event
js = string.format("document.querySelector('%s').click();", page_element)
-- Run the paging script
splash:runjs(js)
-- Wait for the page to finish loading
wait_for_element(splash, wait_for)
wait_for_element(splash, page_element)
-- An extra delay seems to be required here, otherwise the page update may not have finished
assert(splash:wait(5))
-- Add this page's HTML to the result set
table.insert(results, splash.html())
end
return results
end
end
"""
splash_url = 'http://' + self.splash_ip + ':8050/execute'
data = json.dumps({'lua_source':lua_scripts})
r = HTMLSession().post(splash_url, headers=headers, data=data)
return r
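# --- Usage sketch (illustrative only, not part of the crawler) ----------------------------
# Shows how the Splash wrapper above is typically driven: wait_for / page_element are CSS
# selectors, and the execute endpoint returns one HTML snapshot per crawled page. The URL,
# selectors and page count below are placeholders, not real targets.
def _splash_usage_example():
    splash = Splash()
    r = splash.post('https://example.com/list', wait_for='.list-item', pages=3, page_element='.next-page')
    if r.status_code == 200:
        snapshots = json.loads(r.text)       # the Lua result table comes back as a JSON object
        for key in sorted(snapshots):        # keys are "1", "2", ... one entry per page
            html = HTML(html=snapshots[key])
            print(key, len(html.find('a')))  # e.g. count the links found on each page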
class Crawler:
def __init__(self, connect):
self.connect = connect
def generate_id(self):
# Generate a 32-character hexadecimal ID
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + str(random.randint(0, 1000000))
md5_hash = hashlib.md5()
md5_hash.update(current_time.encode('utf-8'))
return md5_hash.hexdigest()
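# Illustrative check (assumption: only the shape of the ID matters): generate_id() hashes the
# current timestamp plus a random number with MD5, so it always yields 32 hex characters.
def _generate_id_example(crawler):
    new_id = crawler.generate_id()
    assert len(new_id) == 32 and all(c in '0123456789abcdef' for c in new_id)
    return new_id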
def write_log_information(self, data_id, catalog_name, log_type='采购公告'):
# After a record is inserted, the related bookkeeping rows must be written as well: one catalogdata row and one readlog row
with self.connect.cursor() as cursor:
affected_row = cursor.execute("select id from catalog where name = '%s'" % (log_type))
if affected_row == 0:
return False
result = cursor.fetchall()
catalog_id = result[0][0]
catalogdata_id = self.generate_id()
readlog_id = self.generate_id()
affected_row = cursor.execute("SELECT staffid FROM userinfo where username = 'root'")
if affected_row == 0:
return False
result = cursor.fetchall()
staff_id = result[0][0]
add_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
affected_row = cursor.execute(
'insert into catalogdata (id, dataid, catalogid, creatorid, menderid, adddate, modifydate, datastatus) values (%s, %s, %s, %s, %s, %s, %s, %s)',
(catalogdata_id, data_id, catalog_id, staff_id, staff_id, add_date, add_date, 0))
cursor.execute(
'insert into readlog (id, dataid, staffid, readnum, adddate, LastAccessDate, resid) values (%s, %s, %s, %s, %s, %s, %s)',
(readlog_id, data_id, staff_id, 1, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), catalog_id))
return True
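# Sketch of the same catalog lookup using pymysql parameter binding instead of '%' string
# interpolation (assumed equivalent in behaviour); the driver then handles quoting, so catalog
# names containing quotes cannot break the SQL. _find_catalog_id is a hypothetical helper name.
def _find_catalog_id(connect, log_type='采购公告'):
    with connect.cursor() as cursor:
        if cursor.execute("select id from catalog where name = %s", (log_type,)) == 0:
            return None
        return cursor.fetchall()[0][0]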
def write_information(self, title, url, region, publishTime, announcementType):
# Write a single announcement record into the database
with self.connect.cursor() as cursor:
cggg_id = self.generate_id()
try:
title = title.replace("'", "\\\'")
affected_rows = cursor.execute(
'insert into sc_cggg (id, bt, lj, ssqy, fbsj, gglb) values (%s, %s, %s, %s, %s, %s)',
(cggg_id, title, url, region, publishTime, announcementType))
except pymysql.err.IntegrityError:
print('信息重复')
self.connect.rollback()
return False
else:
if self.write_log_information(cggg_id, announcementType):
self.connect.commit()
else:
print('添加采购信息失败')
self.connect.rollback()
return False
return True
def write_information_cgyx(self, cgyx):
# Write a single procurement-intention (采购意向) record into the sc_cgyx table
with self.connect.cursor() as cursor:
cgyx_id = self.generate_id()
cgyx['cgxmmc'] = cgyx['cgxmmc'].replace("'", "\\\'")
strSql = 'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) values (\''+cgyx_id+'\',\''+cgyx['cgxmmc']+'\',\''+cgyx['lj']+'\',\''+cgyx['cgxqqk']+'\',\''+cgyx['ysje']+'\',\''+cgyx['yjcgsj']+'\',\''+cgyx['ly']+'\')'
try:
affected_rows = cursor.execute(strSql)
except pymysql.err.IntegrityError:
print('信息重复')
#self.connect.rollback()
return False
else:
if self.write_log_information(cgyx_id, '采购意向'):
self.connect.commit()
else:
print('添加采购信息失败')
self.connect.rollback()
return False
return True
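# The insert above builds its SQL by string concatenation, which breaks on values that contain
# quotes. A parameterized sketch of the same statement (hypothetical helper, same columns):
def _insert_cgyx_example(cursor, cgyx_id, cgyx):
    cursor.execute(
        'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) values (%s, %s, %s, %s, %s, %s, %s)',
        (cgyx_id, cgyx['cgxmmc'], cgyx['lj'], cgyx['cgxqqk'], cgyx['ysje'], cgyx['yjcgsj'], cgyx['ly']))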
def Check(self):
with self.connect.cursor() as cursor:
affected_row = cursor.execute("select id as total from sc_cggg where date(fbsj) > (NOW() - INTERVAL 1 DAY);")
if affected_row == 0:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息', '采购信息提取不正常,请检查!')
return False
else:
return True
def Crawl(self):
# Top-level entry point that drives all of the crawling work.
# Crawl 浙江政府采购网 (Zhejiang government procurement portal)
print('开始获取浙江政采网的信息\n')
# Announcement-type parameters handed to the page crawler
infoType = [
#{"announcementCode": "110-175885", "announcementType":"采购意向"},
#{"announcementCode": "110-978863", "announcementType":"采购公告"},
#{"announcementCode": "110-943756", "announcementType":"更正公告"},
{"announcementCode": "110-774650", "announcementType":"非政府采购公告"},
#{"announcementCode": "110-900461", "announcementType":"结果公告"}
]
for typeParam in infoType:
for page in range(1, 70):
try:
self.CrawlPage_zfcg_czt_zj(page, typeParam)
except Exception as e:
print('3--------------------------------', e)
# Crawl 宁波市阳光采购网
print('开始获取宁波市阳光采购网的信息\n')
infoType = [
{"announcementCode": "21", "announcementType":"采购公告"},
{"announcementCode": "23", "announcementType":"更正公告"},
{"announcementCode": "22", "announcementType":"结果公告"}
]
for typeParam in infoType:
try:
self.CrawlPage_ygcg_nbcqjy_org(2, typeParam)
except Exception as e:
print('4--------------------------------', e)
# Crawl 宁波市中介超市网
print('开始获取宁波市中介超市网的信息\n')
infoType = [
{"announcementCode": '10', "announcementType":"业务需求公告"},
{"announcementCode": '11', "announcementType":"业务需求补充公告"},
{"announcementCode": '20', "announcementType":"中选结果公告"},
{"announcementCode": '21', "announcementType":"中选结果补充公告"},
{"announcementCode": '22', "announcementType":"中选结果补充公告"}
]
for typeParam in infoType:
for page in range(1, 6):
try:
self.CrawlPage_zjcs_nbxzfw(page, typeParam)
except Exception as e:
print('5------------------------------', e)
# Crawl procurement information of municipal state-owned enterprises (宁波市国资委)
print('开始获取宁波市国资委市属企业招投标网的信息\n')
for page in range(1, 5):
try:
self.CrawlPage_gzw_ningbo(page)
except Exception as e:
print('6------------------------------', e)
# Crawl 宁波中基国际招标网
print('开始获取宁波中基国际招标网的信息\n')
infoType = [
{"announcementCode": "22", "announcementType":"采购公告"},
{"announcementCode": "23", "announcementType":"结果公告"}
]
for typeParam in infoType:
for page in range(1, 6):
try:
self.CrawlPage_cbbidding(page, typeParam)
except Exception as e:
print('7--------------------------------', e)
# Crawl 浙江国际招标网
print('开始获取浙江国际招标网的信息\n')
infoType = [
{"announcementCode": "Zbgg", "announcementType":"采购公告"},
{"announcementCode": "Gzgg", "announcementType":"更正公告"},
{"announcementCode": "jggg", "announcementType":"结果公告"}
]
for typeParam in infoType:
for page in range(1, 5):
try:
self.CrawlPage_zmeetb(page, typeParam)
except Exception as e:
print('8----------------------------', e)
# Crawl the website of 宁波市国际招标有限公司
print('开始获取宁波国际招标网的信息\n')
# Announcement-type parameters handed to the page crawler
infoType = [
{"announcementCode": "1", "announcementType":"采购公告"},
{"announcementCode": "1", "announcementType":"结果公告"},
{"announcementCode": "2", "announcementType":"采购公告"},
{"announcementCode": "2", "announcementType":"结果公告"}
]
for typeParam in infoType:
for page in range(1, 5):
try:
self.CrawlPage_nbbidding(page, typeParam)
except Exception as e:
print('9--------------------------------', e)
# Crawl the website of 宁波名诚招标代理有限公司
print('开始获取宁波名诚招标的信息\n')
# Announcement-type parameters handed to the page crawler
infoType = [
{"announcementCode": "99", "announcementType":"采购公告"},
{"announcementCode": "88", "announcementType":"结果公告"}
]
for typeParam in infoType:
for page in range(1, 2):
try:
self.CrawlPage_nbmcbidding(page, typeParam)
except Exception as e:
print('10--------------------------------', e)
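# Every source above follows the same pattern: a list of {"announcementCode", "announcementType"}
# dicts is iterated and each (page, typeParam) pair is handed to the matching CrawlPage_* method.
# A generic sketch of that loop (hypothetical helper, not called anywhere):
def _crawl_one_source(crawl_page_method, info_type, max_page):
    for typeParam in info_type:
        for page in range(1, max_page):
            try:
                crawl_page_method(page, typeParam)
            except Exception as e:
                print('crawl error --------------------------------', e)
# e.g. _crawl_one_source(crawler.CrawlPage_zmeetb, infoType, 5)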
# 宁波中基国际招标有限公司 https://www.cbbidding.com/
def CrawlPage_cbbidding(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
session.DEFAULT_RETRIES = 5
url = 'https://www.cbbidding.com/Index/cms.html?mid=' +typeParam['announcementCode'] + '&%2FIndex%2Fcms%2Fmid%2F' + typeParam['announcementCode'] + '_html=&page=' + str(page)
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"DNT": '1',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
# 这个网站返回的是一个网页,所以需要进行网页解析
r = session.get(url = url, headers = headers)
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中基国际招标网', r.text)
return False
# 注意:xpath 函数返回的是list对象, 对象的元素是element
data = r.html.xpath('/html/body/div[3]/div[3]/div[2]/div[2]/div/ul/li')
for item in data:
title = item.xpath('//a')[0].text
url = 'https://www.cbbidding.com' + item.xpath('//a')[0].attrs.get('href')
region = '中基招标'
publishDate = item.xpath('//div')[0].text
try:
publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
except Exception as e:
publishDate = publishDate.replace('.', '-')
publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
print(url, title)
announcementType = typeParam['announcementType']
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
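# The publish date on this site appears either as 'YYYY-MM-DD' or 'YYYY.MM.DD'; the fallback
# above handles the dotted form. A compact equivalent (hypothetical helper) that tries both:
def _parse_publish_date(text):
    for fmt in ('%Y-%m-%d', '%Y.%m.%d'):
        try:
            return str(datetime.datetime.strptime(text, fmt))
        except ValueError:
            continue
    raise ValueError('unrecognised date format: ' + text)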
# 浙江国际招投标有限公司 https://www.zmeetb.com/
def CrawlPage_zmeetb(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
url = 'https://www.zmeetb.com/' +typeParam['announcementCode'] + '/index/p/' + str(page) + '.html'
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Cache-Control": "max-age=0",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "close",
"DNT": '1',
"Host": "www.zmeetb.com",
"sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="99"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "Windows",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
# 这个网站返回的是一个网页,所以需要进行网页解析
# 这个网站如果使用render()函数,会遇到ssl证书问题,需要进一步研究chromium浏览器的证书问题
#r = session.get(url = url, headers = headers, verify='/opt/PyGuoyan/www.zmeetb.com')
r = session.get(url = url, headers = headers, verify=False)
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:浙江国际招标网', r.text)
return False
# 注意:xpath 函数返回的是list对象, 对象的元素是element
data = r.html.xpath('/html/body/div[1]/div[3]/div[2]/div/div/div[3]/div/ul/li/a')
for item in data:
title = item.xpath('//p')[0].text
url = item.attrs.get('href')
region = '浙江国际招标'
publishDate = item.xpath('//p')[1].text
announcementType = typeParam['announcementType']
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
# 宁波市名诚招标有限有限公司 http://www.nbmcbidding.com/
def CrawlPage_nbmcbidding(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
if typeParam['announcementType'] == '采购公告':
url = "http://www.nbmcbidding.com/news/99/"+str(page)+"/"
else:
url = "http://www.nbmcbidding.com/news/88/"+str(page)+"/"
data = {}
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Host": "www.nbmcbidding.com",
'Connection': 'keep-alive',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
r = session.get(url = url, headers = headers, json = data)
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波名诚招标代理有限公司', r.text)
return False
# 注意:xpath 函数返回的是list对象, 对象的元素是element
data = r.html.xpath('/html/body/div[1]/div/div[3]/div[2]/ul/li')
for item in data:
title = item.xpath('//a/div[2]')[0].text
url = item.xpath('//a')[0].attrs.get('href')
region = '宁波名诚招标'
publishDate = item.xpath('//a/div[4]')[0].text
announcementType = typeParam['announcementType']
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
# 宁波市国际招标有限公司 http://www.nbbidding.com/
def CrawlPage_nbbidding(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
if typeParam['announcementType'] == '采购公告':
url = "http://www.nbbidding.com/Home/Notice/news_list?page="+str(page)+"&is_Open=1&keyword"
else:
url = "http://www.nbbidding.com/Home/Publicity/news_list?page="+str(page)+"&is_Open=1&keyword"
data = {}
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Host": "www.nbbidding.com",
'Connection': 'keep-alive',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
r = session.get(url = url, headers = headers, json = data)
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国际招标网', r.text)
return False
data = json.loads(r.text)['data']
total = data['page']['count']
data = data['list']
for item in data:
id = item['id']
if typeParam['announcementType'] == '采购公告':
url = 'http://www.nbbidding.com/Home/Notice/news_detail?id=%s' % (id)
else:
url = 'http://www.nbbidding.com/Home/Publicity/news_detail?id=%s' % (id)
title = item['title']
region = '宁波国际招标'
publishDate = item['addtime']
announcementType = item['stage']
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
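# For reference, the JSON parsed above is assumed to look roughly like this (field names are
# taken from the parsing code; the concrete values are invented):
_nbbidding_example_response = {
    "data": {
        "page": {"count": 120},
        "list": [
            {"id": 1001, "title": '某采购项目公告', "addtime": '2024-03-29 10:00:00', "stage": '采购公告'}
        ]
    }
}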
# 宁波市国资委属企业招标信息网
def CrawlPage_gzw_ningbo(self, page):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
url = 'http://gzw.ningbo.gov.cn/col/col1229663137/index.html?uid=6085425&pageNum=%s' % str(page)
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"DNT": '1',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
# 这个网站返回的是一个网页,所以需要进行网页解析
r = session.get(url = url, headers = headers)
r.html.render()
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国资委市属企业招标信息网', r.text)
return False
# 注意:xpath 函数返回的是list对象, 对象的元素是element
data = r.html.xpath('/html/body/div[2]/div[3]/div/div/div[2]/div/div/div/ul/li')
for item in data:
title = item.xpath('//a')[0].text
url = item.xpath('//a')[0].attrs.get('href')
region = '宁波市属国企'
publishDate = item.xpath('//p')[0].text
announcementType = '采购公告'
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
# 宁波市中介超市网
# 2024-03-29 更新
def CrawlPage_zjcs_nbxzfw(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
# type 用于判别采购信息的类型
session = HTMLSession()
urllist = ['https://zjcs.zwb.ningbo.gov.cn/siteapi/api/Portal/GetBulletinInfoList']
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"DNT": '1',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
payload = {
"page": page,
"pageSize": 10,
"bulletin_type_id": typeParam["announcementCode"]
}
for url in urllist:
r = session.post(url = url, headers = headers, json = payload)
if r.status_code != 200:
print("error")
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text)
return False
data = json.loads(r.text)['body']
total = data['total']
data = data['data']['bulletinInfoList']
for item in data:
articleId = item['auto_id']
BulletinTypeId = typeParam["announcementCode"]
url = 'https://zjcs.zwb.ningbo.gov.cn/gDetails?id=%s' % (articleId)
title = item['bulletin_title']
region = '宁波中介超市'
publishDate = item['publish_date'].replace('T', ' ')
announcementType = typeParam['announcementType']
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
# 宁波阳光采购网
def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
url = 'https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType=' + typeParam['announcementCode']
wait_for = '.ant-pagination-item-ellipsis'
page_element = '.anticon-right'
try:
r = Splash().post(url, wait_for, pages=pages, page_element=page_element)
except Exception as e:
print(e)
return False
results = json.loads(r.text)
# 这个方法是实际爬取指定页面的信息。
if r.status_code != 200:
# this method fetches all pages in a single Splash call, so alert on any failure
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, 错误代码:'+str(r.status_code), r.text)
return False
for i in range(1, pages + 1):
data = HTML(html=results[str(i)]).xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[5]/div[1]/div/ul/li')
if len(data) == 0:
print('数据为空')
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, 数据为空', '第' + str(i) + '页未解析到数据')
return False
for item in data:
url = 'http://ygcg.nbcqjy.org' + item.xpath('//a')[0].attrs.get('href')
title = item.xpath('//a/span[3]')[0].text
region = '宁波阳光采购'
publishDate = item.xpath('//div[2]')[0].text
announcementType = typeParam['announcementType']
print(publishDate, url, i, title)
self.write_information(title, url, region, publishDate, announcementType)
# 浙江政府采购网
def CrawlPage_zfcg_czt_zj(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
url = 'https://zfcg.czt.zj.gov.cn/portal/category'
if typeParam['announcementCode'] == '110-420383':
data = {
"pageNo": page,
"pageSize": 15,
"categoryCode": typeParam['announcementCode'],
"districtCode": ["339900"],
"isProvince": True,
"includeGovDistrict": "1",
"_t": 1699104836000
}
else:
data = {
"pageNo": page,
"pageSize": 15,
"categoryCode": typeParam['announcementCode'],
"isGov": True,
"excludeDistrictPrefix": "90",
"_t": 1699104836000
}
headers = {
"accept": "application/json, text/plain, */*",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
"content-type": "application/json;charset=UTF-8",
"sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"x-requested-with": "XMLHttpRequest"
}
try:
r = session.post(url = url, headers = headers, json = data)
except Exception as e:
print('10-------------------------', e)
return False
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波政府采购网', r.text)
return False
data = json.loads(r.text)['result']['data']
total = data['total']
data = data['data']
for item in data:
publishDate = datetime.datetime.fromtimestamp(item['publishDate']/1000)
pageUrl = 'https://zfcg.czt.zj.gov.cn/luban/detail?parentId=600007&articleId=' + item['articleId'] + '&utm=luban.luban-PC-37000.979-pc-websitegroup-zhejiang-secondPage-front.21.320086307d6811ee86314be74945ec2c'
detailUrl = 'https://zfcg.czt.zj.gov.cn/portal/detail?articleId=' + item['articleId']
announcementType = typeParam['announcementType']
if announcementType == '采购意向':
r = session.get(url = detailUrl, headers = headers)
detailData = json.loads(r.text)['result']['data']
if detailData is None:
break
content = HTML(html='<xml>'+detailData['content']+'</xml>')
region = item['districtName']
for detailItem in content.xpath('xml/div/div/div[1]/div/table/tbody/tr'):
title = detailItem.xpath('//td[2]')[0].text
cgxqqk = detailItem.xpath('//td[3]')[0].text
ysje = detailItem.xpath('//td[4]')[0].text
yjcgsj = detailItem.xpath('//td[5]')[0].text
ly = detailData["title"]
self.write_information(title, pageUrl, region, publishDate, announcementType)
self.write_information_cgyx({'cgxmmc':title,'lj':pageUrl, 'cgxqqk':cgxqqk, 'ysje':ysje, 'yjcgsj':yjcgsj, 'ly':ly})
else:
title = item['title']
region = item['districtName']
print(publishDate, url, page, title)
self.write_information(title, pageUrl, region, publishDate, announcementType)
#print(publishDate, url)
return True
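# --- Module usage sketch -------------------------------------------------------------------
# How this module is presumably wired together: open a pymysql connection, hand it to Crawler,
# run Crawl() and then Check(). Host, credentials and database name below are placeholders.
if __name__ == '__main__':
    connect = pymysql.connect(host='127.0.0.1', user='user', password='password',
                              database='presales', charset='utf8mb4')
    try:
        crawler = Crawler(connect)
        crawler.Crawl()   # fetch announcements from all configured sources
        crawler.Check()   # mail a warning if nothing was collected in the last day
    finally:
        connect.close()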