#!/usr/bin/python3

"""
===========================================================================================
File name: crawler.py

A module that crawls procurement information.

It processes procurement announcements and touches four tables: sc_cggg, catalog,
catalogdata and readlog.
===========================================================================================

class Crawler:

    def __init__(self, connect):

    def generate_id(self):

    def write_log_information(self, data_id, catalog_name, log_type='采购公告'):

    def CrawlPage_gzw_ningbo(self, page):                   # Ningbo SASAC municipal SOE bidding information
    def CrawlPage_zjcs_nbxzfw(self, page, typeParam):       # Ningbo intermediary supermarket
    def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):  # Ningbo sunshine procurement
    def CrawlPage_zfcg_czt_zj(self, page, typeParam):       # Zhejiang government procurement
    def CrawlPage_cbbidding(self, page, typeParam):         # Ningbo CB International Tendering Co., Ltd.
    def CrawlPage_zmeetb(self, page, typeParam):            # Zhejiang International Tendering Co., Ltd.
    def CrawlPage_nbbidding(self, page, typeParam):         # Ningbo International Tendering Co., Ltd.
    def CrawlPage_nbmcbidding(self, page, typeParam):       # Ningbo Mingcheng Bidding Agency Co., Ltd.
============================================================================================
"""

import datetime
import hashlib
import json
import random
import time

import pymysql
from requests_html import HTMLSession
from requests_html import HTML, UserAgent

import gymailer
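
# gymailer is assumed to be a local helper module that wraps outgoing email;
# everywhere in this file it is called as
# gymailer.SendMail(sender, recipient, subject, body).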


'''
============================================================
This class wraps the Splash rendering service.

self.splash_ip is the IP address of the Splash service.
============================================================
'''


class Splash:
    def __init__(self):
        self.splash_ip = '127.0.0.1'

    '''
    ============================================================
    The wait_for parameter specifies the element to wait for: the call only
    returns once that element has been rendered, and gives up after 200
    seconds otherwise. wait_for takes a CSS selector: use the "#app" form to
    target an element id and the '.class-name' form to target an element
    class.
    ============================================================
    '''
    def post(self, url, wait_for, pages=1, page_element='', headers={'content-type': 'application/json', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'}):
        lua_scripts = """
        function wait_for_element(splash, css, maxwait)
            -- Wait until a selector matches an element
            -- in the page. Return an error if waited more
            -- than maxwait seconds.
            if maxwait == nil then
                maxwait = 10
            end
            return splash:wait_for_resume(string.format([[
                function main(splash) {
                    var selector = '%s';
                    var maxwait = %s;
                    var end = Date.now() + maxwait*1000;

                    function check() {
                        if(document.querySelector(selector)) {
                            splash.resume('Element found');
                        } else if(Date.now() >= end) {
                            var err = 'Timeout waiting for element';
                            splash.error(err + " " + selector);
                        } else {
                            setTimeout(check, 200);
                        }
                    }
                    check();
                }
            ]], css, maxwait))
        end

        function main(splash, args)
            pages = """ + str(pages) + """
            page_element = '""" + page_element + """'
            wait_for = '""" + wait_for + """'
            splash:go('""" + url + """')
            wait_for_element(splash, wait_for)
            wait_for_element(splash, page_element)

            -- Add the first page to the result set
            results = {splash:html()}

            if pages == 1 then
                return results
            else
                -- Turn the pages: locate the paging element on the page,
                -- then send it a click() event
                for i = 2, pages do
                    -- js holds the JavaScript that fetches the paging
                    -- element and sends the click event
                    js = string.format("document.querySelector('%s').click();", page_element)

                    -- Run the paging script
                    splash:runjs(js)

                    -- Wait for the page to finish loading
                    wait_for_element(splash, wait_for)
                    wait_for_element(splash, page_element)

                    -- A delay seems to be required here; without it the page
                    -- may not have finished updating
                    assert(splash:wait(5))

                    -- Append the page to the result set
                    table.insert(results, splash:html())
                end
                return results
            end
        end
        """

        splash_url = 'http://' + self.splash_ip + ':8050/execute'
        data = json.dumps({'lua_source': lua_scripts})
        r = HTMLSession().post(splash_url, headers=headers, data=data)
        return r
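

# A minimal usage sketch for Splash.post (illustrative only: the URL and the
# two CSS selectors below are placeholder assumptions, not real endpoints).
# The Splash /execute endpoint returns the Lua result as JSON; as the code in
# CrawlPage_ygcg_nbcqjy_org shows, the result table arrives as an object
# keyed by page number strings:
#
#     r = Splash().post('https://example.com/list', wait_for='.list-item',
#                       pages=2, page_element='.next-page')
#     if r.status_code == 200:
#         snapshots = json.loads(r.text)   # e.g. {'1': '<html>...', '2': '<html>...'}
#         first_page = HTML(html=snapshots['1'])
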

class Crawler:
    def __init__(self, connect):
        self.connect = connect

    def generate_id(self):
        # Generate a 32-character hex ID: MD5 of a second-resolution timestamp
        # plus a random integer. Collisions are unlikely but not impossible.
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + str(random.randint(0, 1000000))
        md5_hash = hashlib.md5()
        md5_hash.update(current_time.encode('utf-8'))
        return md5_hash.hexdigest()

    def write_log_information(self, data_id, catalog_name, log_type='采购公告'):
        # After a record has been added, the related bookkeeping has to be
        # updated as well: one catalogdata row and one readlog row.
        # Note that catalog_name is currently unused; the catalog row is
        # looked up by log_type.
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute('select id from catalog where name = %s', (log_type,))
            if affected_row == 0:
                return False

            result = cursor.fetchall()
            catalog_id = result[0][0]
            catalogdata_id = self.generate_id()
            readlog_id = self.generate_id()

            affected_row = cursor.execute("SELECT staffid FROM userinfo where username = 'root'")
            if affected_row == 0:
                return False

            result = cursor.fetchall()
            staff_id = result[0][0]
            add_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            affected_row = cursor.execute(
                'insert into catalogdata (id, dataid, catalogid, creatorid, menderid, adddate, modifydate, datastatus) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                (catalogdata_id, data_id, catalog_id, staff_id, staff_id, add_date, add_date, 0))

            cursor.execute(
                'insert into readlog (id, dataid, staffid, readnum, adddate, LastAccessDate, resid) values (%s, %s, %s, %s, %s, %s, %s)',
                (readlog_id, data_id, staff_id, 1, add_date, add_date, catalog_id))

        return True

    def write_information(self, title, url, region, publishTime, announcementType):
        # Write one announcement record to the database.
        with self.connect.cursor() as cursor:
            cggg_id = self.generate_id()

            try:
                # pymysql parameter binding handles quoting, so no manual
                # escaping of the title is needed.
                affected_rows = cursor.execute(
                    'insert into sc_cggg (id, bt, lj, ssqy, fbsj, gglb) values (%s, %s, %s, %s, %s, %s)',
                    (cggg_id, title, url, region, publishTime, announcementType))
            except pymysql.err.IntegrityError:
                print('Duplicate record')
                self.connect.rollback()
                return False
            else:
                if self.write_log_information(cggg_id, announcementType):
                    self.connect.commit()
                else:
                    print('Failed to add procurement record')
                    self.connect.rollback()
                    return False

        return True
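
    # Shape of the dict expected by write_information_cgyx below; the keys
    # mirror the sc_cgyx columns and the dict is built this way by
    # CrawlPage_zfcg_czt_zj:
    #
    #     {'cgxmmc': project name, 'lj': link, 'cgxqqk': requirement summary,
    #      'ysje': budget amount, 'yjcgsj': planned purchase date, 'ly': source title}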

    def write_information_cgyx(self, cgyx):
        # Write one procurement-intention record to the database.
        with self.connect.cursor() as cursor:
            cgyx_id = self.generate_id()
            try:
                # Use parameter binding instead of string-concatenated SQL so
                # that quotes in the scraped values cannot break the statement.
                affected_rows = cursor.execute(
                    'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) values (%s, %s, %s, %s, %s, %s, %s)',
                    (cgyx_id, cgyx['cgxmmc'], cgyx['lj'], cgyx['cgxqqk'], cgyx['ysje'], cgyx['yjcgsj'], cgyx['ly']))
            except pymysql.err.IntegrityError:
                print('Duplicate record')
                return False
            else:
                if self.write_log_information(cgyx_id, '采购意向'):
                    self.connect.commit()
                else:
                    print('Failed to add procurement record')
                    self.connect.rollback()
                    return False

        return True

    def Check(self):
        # Send an alert email if no announcement has been stored within the
        # last day (date(fbsj) truncates to midnight, so this effectively asks
        # for anything published since yesterday).
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id as total from sc_cggg where date(fbsj) > (NOW() - INTERVAL 1 DAY);")
            if affected_row == 0:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert', 'Procurement information extraction looks abnormal, please check!')
                return False
            else:
                return True

    def Crawl(self):
        # Top-level entry point that performs the actual crawling.

        # Crawl the Zhejiang government procurement site
        print('Fetching Zhejiang government procurement announcements\n')

        # Announcement-type parameters passed to the page crawler
        infoType = [
            {"announcementCode": "110-175885", "announcementType": "采购意向"},
            {"announcementCode": "110-978863", "announcementType": "采购公告"},
            {"announcementCode": "110-943756", "announcementType": "更正公告"},
            {"announcementCode": "110-774650", "announcementType": "非政府采购公告"},
            {"announcementCode": "110-900461", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 11):
                try:
                    self.CrawlPage_zfcg_czt_zj(page, typeParam)
                except Exception as e:
                    print('3--------------------------------', e)

        # Crawl the Ningbo sunshine procurement site
        print('Fetching Ningbo sunshine procurement announcements\n')
        infoType = [
            {"announcementCode": "21", "announcementType": "采购公告"},
            {"announcementCode": "23", "announcementType": "更正公告"},
            {"announcementCode": "22", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            try:
                self.CrawlPage_ygcg_nbcqjy_org(2, typeParam)
            except Exception as e:
                print('4--------------------------------', e)

        # Crawl the Ningbo intermediary supermarket site
        print('Fetching Ningbo intermediary supermarket announcements\n')
        infoType = [
            {"announcementCode": '10', "announcementType": "业务需求公告"},
            {"announcementCode": '11', "announcementType": "业务需求补充公告"},
            {"announcementCode": '20', "announcementType": "中选结果公告"},
            {"announcementCode": '21', "announcementType": "中选结果补充公告"},
            {"announcementCode": '22', "announcementType": "中选结果补充公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_zjcs_nbxzfw(page, typeParam)
                except Exception as e:
                    print('5------------------------------', e)

        # Crawl the procurement information of Ningbo SASAC municipal SOEs
        print('Fetching Ningbo SASAC municipal SOE bidding announcements\n')
        for page in range(1, 5):
            try:
                self.CrawlPage_gzw_ningbo(page)
            except Exception as e:
                print('6------------------------------', e)

        # Crawl the Ningbo CB International Tendering site
        print('Fetching Ningbo CB International Tendering announcements\n')
        infoType = [
            {"announcementCode": "22", "announcementType": "采购公告"},
            {"announcementCode": "23", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_cbbidding(page, typeParam)
                except Exception as e:
                    print('7--------------------------------', e)

        # Crawl the Zhejiang International Tendering site
        print('Fetching Zhejiang International Tendering announcements\n')
        infoType = [
            {"announcementCode": "Zbgg", "announcementType": "采购公告"},
            {"announcementCode": "Gzgg", "announcementType": "更正公告"},
            {"announcementCode": "jggg", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_zmeetb(page, typeParam)
                except Exception as e:
                    print('8----------------------------', e)

        # Crawl the Ningbo International Tendering site
        print('Fetching Ningbo International Tendering announcements\n')

        # Announcement-type parameters passed to the page crawler
        infoType = [
            {"announcementCode": "1", "announcementType": "采购公告"},
            {"announcementCode": "1", "announcementType": "结果公告"},
            {"announcementCode": "2", "announcementType": "采购公告"},
            {"announcementCode": "2", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_nbbidding(page, typeParam)
                except Exception as e:
                    print('9--------------------------------', e)

        # Crawl the Ningbo Mingcheng Bidding Agency site
        print('Fetching Ningbo Mingcheng Bidding Agency announcements\n')

        # Announcement-type parameters passed to the page crawler
        infoType = [
            {"announcementCode": "99", "announcementType": "采购公告"},
            {"announcementCode": "88", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 2):
                try:
                    self.CrawlPage_nbmcbidding(page, typeParam)
                except Exception as e:
                    print('10--------------------------------', e)

    # Ningbo CB International Tendering Co., Ltd.  https://www.cbbidding.com/
    def CrawlPage_cbbidding(self, page, typeParam):
        # Crawl one listing page of this site.
        session = HTMLSession()
        session.DEFAULT_RETRIES = 5
        url = 'https://www.cbbidding.com/Index/cms.html?mid=' + typeParam['announcementCode'] + '&%2FIndex%2Fcms%2Fmid%2F' + typeParam['announcementCode'] + '_html=&page=' + str(page)

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the response has to be parsed
        r = session.get(url=url, headers=headers)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo CB International Tendering', r.text)
            return False

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[3]/div[3]/div[2]/div[2]/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = 'https://www.cbbidding.com' + item.xpath('//a')[0].attrs.get('href')
            region = '中基招标'
            publishDate = item.xpath('//div')[0].text

            try:
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
            except Exception as e:
                # Some entries use '.' as the date separator
                publishDate = publishDate.replace('.', '-')
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))

            print(url, title)
            announcementType = typeParam['announcementType']
            self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang International Tendering Co., Ltd.  https://www.zmeetb.com/
    def CrawlPage_zmeetb(self, page, typeParam):
        # Crawl one listing page of this site.
        session = HTMLSession()
        url = 'https://www.zmeetb.com/' + typeParam['announcementCode'] + '/index/p/' + str(page) + '.html'

        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Cache-Control": "max-age=0",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
            "DNT": '1',
            "Host": "www.zmeetb.com",
            "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "Windows",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the response has to be parsed.
        # render() runs into an SSL certificate problem on this site (the
        # chromium certificate handling needs further study), so certificate
        # verification is disabled for now.
        # r = session.get(url=url, headers=headers, verify='/opt/PyGuoyan/www.zmeetb.com')
        r = session.get(url=url, headers=headers, verify=False)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Zhejiang International Tendering', r.text)
            return False

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[1]/div[3]/div[2]/div/div/div[3]/div/ul/li/a')
        for item in data:
            title = item.xpath('//p')[0].text
            url = item.attrs.get('href')
            region = '浙江国际招标'
            publishDate = item.xpath('//p')[1].text
            announcementType = typeParam['announcementType']

            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo Mingcheng Bidding Agency Co., Ltd.  http://www.nbmcbidding.com/
    def CrawlPage_nbmcbidding(self, page, typeParam):
        # Crawl one listing page of this site.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbmcbidding.com/news/99/" + str(page) + "/"
        else:
            url = "http://www.nbmcbidding.com/news/88/" + str(page) + "/"

        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Host": "www.nbmcbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        r = session.get(url=url, headers=headers)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo Mingcheng Bidding Agency', r.text)
            return False

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[1]/div/div[3]/div[2]/ul/li')
        for item in data:
            title = item.xpath('//a/div[2]')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波名诚招标'
            publishDate = item.xpath('//a/div[4]')[0].text
            announcementType = typeParam['announcementType']

            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo International Tendering Co., Ltd.  http://www.nbbidding.com/
    def CrawlPage_nbbidding(self, page, typeParam):
        # Crawl one listing page of this site (the endpoint returns JSON).
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbbidding.com/Home/Notice/news_list?page=" + str(page) + "&is_Open=1&keyword"
        else:
            url = "http://www.nbbidding.com/Home/Publicity/news_list?page=" + str(page) + "&is_Open=1&keyword"

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Host": "www.nbbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        r = session.get(url=url, headers=headers)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo International Tendering', r.text)
            return False

        data = json.loads(r.text)['data']
        total = data['page']['count']
        data = data['list']

        for item in data:
            id = item['id']
            if typeParam['announcementType'] == '采购公告':
                url = 'http://www.nbbidding.com/Home/Notice/news_detail?id=%s' % (id)
            else:
                url = 'http://www.nbbidding.com/Home/Publicity/news_detail?id=%s' % (id)
            title = item['title']
            region = '宁波国际招标'
            publishDate = item['addtime']
            announcementType = item['stage']
            self.write_information(title, url, region, publishDate, announcementType)

            print(publishDate, title, url)

    # Ningbo SASAC municipal SOE bidding information site
    def CrawlPage_gzw_ningbo(self, page):
        # Crawl one listing page of this site.
        session = HTMLSession()
        url = 'http://gzw.ningbo.gov.cn/col/col1229663137/index.html?uid=6085425&pageNum=%s' % str(page)

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the response has to be parsed
        r = session.get(url=url, headers=headers)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo SASAC municipal SOE bidding site', r.text)
            return False

        # The listing is rendered by JavaScript, so render before parsing
        r.html.render()

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[2]/div[3]/div/div/div[2]/div/div/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波市属国企'
            publishDate = item.xpath('//p')[0].text
            announcementType = '采购公告'
            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo intermediary supermarket site
    # Updated 2024-03-29
    def CrawlPage_zjcs_nbxzfw(self, page, typeParam):
        # Crawl one listing page of this site.
        # typeParam identifies the type of procurement information.
        session = HTMLSession()
        # Note: the endpoint serves plain HTTP on port 443
        urllist = ['http://122.247.77.99:443/siteapi/api/Portal/GetBulletinInfoList']

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        payload = {
            "page": page,
            "pageSize": 10,
            "center_id": "",
            "bulletin_type_id": typeParam["announcementCode"]
        }

        for url in urllist:
            r = session.post(url=url, headers=headers, json=payload)

            if r.status_code != 200:
                if page == 1:
                    gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo intermediary supermarket site', r.text)
                return False

            data = json.loads(r.text)['body']

            total = data['total']
            data = data['data']['bulletinInfoList']

            for item in data:
                articleId = item['auto_id']
                BulletinTypeId = typeParam["announcementCode"]
                url = 'http://122.247.77.99:443/gDetails?id=%s' % (articleId)
                title = item['bulletin_title']
                region = '宁波中介超市'
                publishDate = item['publish_date'].replace('T', ' ')
                announcementType = typeParam['announcementType']
                self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo sunshine procurement site
    def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
        # Crawl the first `pages` listing pages of this site through Splash,
        # since the listing is rendered by JavaScript.
        url = 'https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType=' + typeParam['announcementCode']

        wait_for = '.ant-pagination-item-ellipsis'
        page_element = '.anticon-right'
        try:
            r = Splash().post(url, wait_for, pages=pages, page_element=page_element)
        except Exception as e:
            print(e)
            return False

        if r.status_code != 200:
            gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo sunshine procurement site, status code ' + str(r.status_code), r.text)
            return False

        results = json.loads(r.text)

        for i in range(1, pages + 1):
            data = HTML(html=results[str(i)]).xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[5]/div[1]/div/ul/li')
            if len(data) == 0:
                print('Empty result list')
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo sunshine procurement site', 'Empty result list on page %d' % i)
                return False

            for item in data:
                url = 'http://ygcg.nbcqjy.org' + item.xpath('//a')[0].attrs.get('href')
                title = item.xpath('//a/span[3]')[0].text
                region = '宁波阳光采购'
                publishDate = item.xpath('//div[2]')[0].text
                announcementType = typeParam['announcementType']
                print(title)
                self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang government procurement site
    def CrawlPage_zfcg_czt_zj(self, page, typeParam):
        # Crawl one listing page of this site (the endpoint returns JSON).
        session = HTMLSession()
        url = 'https://zfcg.czt.zj.gov.cn/portal/category'
        if typeParam['announcementCode'] == '110-420383':
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "districtCode": ["339900"],
                "isProvince": True,
                "includeGovDistrict": "1",
                "_t": 1699104836000
            }
        else:
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "isGov": True,
                "excludeDistrictPrefix": "90",
                "_t": 1699104836000
            }

        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "content-type": "application/json;charset=UTF-8",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "x-requested-with": "XMLHttpRequest"
        }

        try:
            r = session.post(url=url, headers=headers, json=data)
        except Exception as e:
            print('10-------------------------', e)
            return False

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Zhejiang government procurement site', r.text)
            return False

        data = json.loads(r.text)['result']['data']
        total = data['total']
        data = data['data']

        for item in data:
            publishDate = datetime.datetime.fromtimestamp(item['publishDate'] / 1000)
            pageUrl = 'https://zfcg.czt.zj.gov.cn/luban/detail?parentId=600007&articleId=' + item['articleId'] + '&utm=luban.luban-PC-37000.979-pc-websitegroup-zhejiang-secondPage-front.21.320086307d6811ee86314be74945ec2c'
            detailUrl = 'https://zfcg.czt.zj.gov.cn/portal/detail?articleId=' + item['articleId']
            announcementType = typeParam['announcementType']
            if announcementType == '采购意向':
                # Procurement intentions carry their items in an embedded HTML
                # table on the detail page, so fetch and parse the detail.
                r = session.get(url=detailUrl, headers=headers)

                detailData = json.loads(r.text)['result']['data']
                if detailData is None:
                    break

                content = HTML(html='<xml>' + detailData['content'] + '</xml>')
                region = item['districtName']
                for detailItem in content.xpath('xml/div/div/div[1]/div/table/tbody/tr'):
                    title = detailItem.xpath('//td[2]')[0].text
                    cgxqqk = detailItem.xpath('//td[3]')[0].text
                    ysje = detailItem.xpath('//td[4]')[0].text
                    yjcgsj = detailItem.xpath('//td[5]')[0].text
                    ly = detailData["title"]

                    self.write_information(title, pageUrl, region, publishDate, announcementType)
                    self.write_information_cgyx({'cgxmmc': title, 'lj': pageUrl, 'cgxqqk': cgxqqk, 'ysje': ysje, 'yjcgsj': yjcgsj, 'ly': ly})
            else:
                title = item['title']
                region = item['districtName']
                self.write_information(title, pageUrl, region, publishDate, announcementType)

        return True
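

# A minimal driver sketch. This block is not part of the original module; the
# connection parameters are placeholder assumptions and must be adapted to
# the real database before use.
if __name__ == '__main__':
    connect = pymysql.connect(host='127.0.0.1', user='root', password='change-me',
                              database='procurement', charset='utf8mb4')
    try:
        crawler = Crawler(connect)
        crawler.Crawl()   # one full crawl pass over all sources
        crawler.Check()   # email an alert if nothing fresh was stored
    finally:
        connect.close()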