#!/usr/bin/python3
"""
===========================================================================================
File name: crawler.py
This module crawls procurement information and handles procurement announcement data.
It mainly involves four tables: sc_cggg, catalog, catalogdata and readlog.
===========================================================================================
class Crawler:
    def __init__(self, connect):
    def generate_id(self):
    def write_log_information(self, data_id, catalog_name):
    def CrawlPage_gzw_ningbo(self, page):        # Ningbo SASAC municipal state-owned enterprise bidding information
    def CrawlPage_zjcs_nbxzfw(self, type, page): # Ningbo intermediary supermarket
    def CrawlPage_ygcg_nbcqjy_org(self, page):   # Ningbo sunshine procurement
    def CrawlPage_zfcg_czt_zj(self, page):       # Zhejiang government procurement network
    def CrawlPage_cbbidding(self, page):         # Ningbo Zhongji International Tendering Co., Ltd.
    def CrawlPage_zmeetb(self, page):            # Zhejiang International Tendering Co., Ltd.
    def CrawlPage_nbbidding(self, page):         # Ningbo International Tendering Co., Ltd.
============================================================================================
"""
import datetime
import hashlib
import pymysql
import json
import random
from requests_html import HTMLSession
from requests_html import HTML, UserAgent
import gymailer
import time


'''
============================================================
This class wraps the Splash rendering service.
    self.splash_ip is the IP address of the Splash service.
============================================================
'''
class Splash:

    def __init__(self):
        self.splash_ip = '127.0.0.1'

    '''
    ============================================================
    wait_for specifies the element that must finish rendering before the call returns;
    otherwise the script waits until maxwait expires (10 seconds by default).
    wait_for takes a CSS selector: use "#app" to select an element by id,
    and '.class-name' to select an element by class.
    ============================================================
    '''
    def post(self, url, wait_for, pages=1, page_element='',
             headers={'content-type': 'application/json', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'}):
        lua_scripts = """
            function wait_for_element(splash, css, maxwait)
                -- Wait until a selector matches an element
                -- in the page. Return an error if waited more
                -- than maxwait seconds.
                if maxwait == nil then
                    maxwait = 10
                end
                return splash:wait_for_resume(string.format([[
                    function main(splash) {
                        var selector = '%s';
                        var maxwait = %s;
                        var end = Date.now() + maxwait*1000;
                        function check() {
                            if(document.querySelector(selector)) {
                                splash.resume('Element found');
                            } else if(Date.now() >= end) {
                                var err = 'Timeout waiting for element';
                                splash.error(err + " " + selector);
                            } else {
                                setTimeout(check, 200);
                            }
                        }
                        check();
                    }
                ]], css, maxwait))
            end

            function main(splash, args)
                pages = """ + str(pages) + """
                page_element = '""" + page_element + """'
                wait_for = '""" + wait_for + """'
                splash:go('""" + url + """')
                wait_for_element(splash, wait_for)
                wait_for_element(splash, page_element)
                -- Add the first page to the result set
                results = {splash.html()}
                if pages == 1 then
                    return results
                else
                    -- Turn the pages one by one:
                    -- locate the paging element on the page, then send a click() event to it
                    for i = 2, pages do
                        -- js holds the JavaScript snippet that finds the paging element and clicks it
                        js = string.format("document.querySelector('%s').click();", page_element)
                        -- Run the paging script
                        splash:runjs(js)
                        -- Wait for the page to finish loading
                        wait_for_element(splash, wait_for)
                        wait_for_element(splash, page_element)
                        -- A delay seems to be required here, otherwise the page may not have
                        -- finished updating and the content would be incomplete
                        assert(splash:wait(5))
                        -- Append this page to the result set
                        table.insert(results, splash.html())
                    end
                    return results
                end
            end
        """
        splash_url = 'http://' + self.splash_ip + ':8050/execute'
        data = json.dumps({'lua_source': lua_scripts})
        r = HTMLSession().post(splash_url, headers=headers, data=data)
        return r


class Crawler:

    def __init__(self, connect):
        self.connect = connect

    def generate_id(self):
        # Generate a 32-character ID
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + str(random.randint(0, 1000000))
        md5_hash = hashlib.md5()
        md5_hash.update(current_time.encode('utf-8'))
        return md5_hash.hexdigest()

    def write_log_information(self, data_id, catalog_name, log_type='采购公告'):
        # After a record has been added, the related bookkeeping must be updated as well:
        # one catalogdata record and one readlog record.
        # Note that the catalog lookup uses log_type; catalog_name is currently unused.
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id from catalog where name = '%s'" % (log_type))
            if affected_row == 0:
                return False
            result = cursor.fetchall()
            catalog_id = result[0][0]
            catalogdata_id = self.generate_id()
            readlog_id = self.generate_id()
            affected_row = cursor.execute("SELECT staffid FROM userinfo where username = 'root'")
            if affected_row == 0:
                return False
            result = cursor.fetchall()
            staff_id = result[0][0]
            add_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            affected_row = cursor.execute(
                'insert into catalogdata (id, dataid, catalogid, creatorid, menderid, adddate, modifydate, datastatus) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                (catalogdata_id, data_id, catalog_id, staff_id, staff_id, add_date, add_date, 0))
            cursor.execute(
                'insert into readlog (id, dataid, staffid, readnum, adddate, LastAccessDate, resid) values (%s, %s, %s, %s, %s, %s, %s)',
                (readlog_id, data_id, staff_id, 1,
                 datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                 datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                 catalog_id))
            return True

    def write_information(self, title, url, region, publishTime, announcementType):
        # Write one announcement record into the database
        with self.connect.cursor() as cursor:
            cggg_id = self.generate_id()
            try:
                title = title.replace("'", "\\\'")
                affected_rows = cursor.execute(
                    'insert into sc_cggg (id, bt, lj, ssqy, fbsj, gglb) values (%s, %s, %s, %s, %s, %s)',
                    (cggg_id, title, url, region, publishTime, announcementType))
            except pymysql.err.IntegrityError:
                print('信息重复')
                self.connect.rollback()
                return False
            else:
                if self.write_log_information(cggg_id, announcementType):
                    self.connect.commit()
                else:
                    print('添加采购信息失败')
                    self.connect.rollback()
                    return False
            return True

    def write_information_cgyx(self, cgyx):
        # Write one procurement-intention record into the database
        with self.connect.cursor() as cursor:
            cgyx_id = self.generate_id()
            cgyx['cgxmmc'] = cgyx['cgxmmc'].replace("'", "\\\'")
            strSql = 'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) values (\'' + cgyx_id + '\',\'' + cgyx['cgxmmc'] + '\',\'' + cgyx['lj'] + '\',\'' + cgyx['cgxqqk'] + '\',\'' + cgyx['ysje'] + '\',\'' + cgyx['yjcgsj'] + '\',\'' + cgyx['ly'] + '\')'
            try:
                affected_rows = cursor.execute(strSql)
            except pymysql.err.IntegrityError:
                print('信息重复')
                #self.connect.rollback()
                return False
            else:
                if self.write_log_information(cgyx_id, '采购意向'):
                    self.connect.commit()
                else:
                    print('添加采购信息失败')
                    self.connect.rollback()
                    return False
            return True

    def Check(self):
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id as total from sc_cggg where date(fbsj) > (NOW() - INTERVAL 1 DAY);")
            if affected_row == 0:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息', '采购信息提取不正常,请检查!')
                return False
            else:
                return True

    def Crawl(self):
        # Main entry point that performs the actual crawling work.

        # Crawl the Zhejiang government procurement network
        print('开始获取浙江政采网的信息\n')
        # Announcement-type parameters passed to the page crawler
        infoType = [
            #{"announcementCode": "110-175885", "announcementType":"采购意向"},
            #{"announcementCode": "110-978863", "announcementType":"采购公告"},
            #{"announcementCode": "110-943756", "announcementType":"更正公告"},
            {"announcementCode": "110-774650", "announcementType":"非政府采购公告"},
            #{"announcementCode": "110-900461", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 70):
                try:
                    self.CrawlPage_zfcg_czt_zj(page, typeParam)
                except Exception as e:
                    print('3--------------------------------', e)

        # Crawl the Ningbo sunshine procurement network
        print('开始获取宁波市阳光采购网的信息\n')
        infoType = [
            {"announcementCode": "21", "announcementType":"采购公告"},
            {"announcementCode": "23", "announcementType":"更正公告"},
            {"announcementCode": "22", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            try:
                self.CrawlPage_ygcg_nbcqjy_org(2, typeParam)
            except Exception as e:
                print('4--------------------------------', e)

        # Crawl the Ningbo intermediary supermarket network
        print('开始获取宁波市中介超市网的信息\n')
        infoType = [
            {"announcementCode": '10', "announcementType":"业务需求公告"},
            {"announcementCode": '11', "announcementType":"业务需求补充公告"},
            {"announcementCode": '20', "announcementType":"中选结果公告"},
            {"announcementCode": '21', "announcementType":"中选结果补充公告"},
            {"announcementCode": '22', "announcementType":"中选结果补充公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_zjcs_nbxzfw(page, typeParam)
                except Exception as e:
                    print('5------------------------------', e)

        # Crawl procurement information of municipal SOEs under the Ningbo SASAC
        print('开始获取宁波市国资委市属企业招投标网的信息\n')
        for page in range(1, 5):
            try:
                self.CrawlPage_gzw_ningbo(page)
            except Exception as e:
                print('6------------------------------', e)

        # Crawl the Ningbo Zhongji international tendering network
        print('开始获取宁波中基国际招标网的信息\n')
        infoType = [
            {"announcementCode": "22", "announcementType":"采购公告"},
            {"announcementCode": "23", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_cbbidding(page, typeParam)
                except Exception as e:
                    print('7--------------------------------', e)

        # Crawl the Zhejiang international tendering network
        print('开始获取浙江国际招标网的信息\n')
        infoType = [
            {"announcementCode": "Zbgg", "announcementType":"采购公告"},
            {"announcementCode": "Gzgg", "announcementType":"更正公告"},
            {"announcementCode": "jggg", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_zmeetb(page, typeParam)
                except Exception as e:
                    print('8----------------------------', e)

        # Crawl the Ningbo International Tendering Co., Ltd. website
        print('开始获取宁波国际招标网的信息\n')
        # Announcement-type parameters passed to the page crawler
        infoType = [
            {"announcementCode": "1", "announcementType":"采购公告"},
            {"announcementCode": "1", "announcementType":"结果公告"},
            {"announcementCode": "2", "announcementType":"采购公告"},
            {"announcementCode": "2", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_nbbidding(page, typeParam)
                except Exception as e:
                    print('9--------------------------------', e)

        # Crawl the Ningbo Mingcheng tendering agency website
        print('开始获取宁波名城招标的信息\n')
        # Announcement-type parameters passed to the page crawler
        infoType = [
            {"announcementCode": "99", "announcementType":"采购公告"},
            {"announcementCode": "88", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 2):
                try:
                    self.CrawlPage_nbmcbidding(page, typeParam)
                except Exception as e:
                    print('10--------------------------------', e)

    # Ningbo Zhongji International Tendering Co., Ltd. https://www.cbbidding.com/
    def CrawlPage_cbbidding(self, page, typeParam):
        # Crawl one specified page.
        session = HTMLSession()
        session.DEFAULT_RETRIES = 5
        url = 'https://www.cbbidding.com/Index/cms.html?mid=' + typeParam['announcementCode'] + '&%2FIndex%2Fcms%2Fmid%2F' + typeParam['announcementCode'] + '_html=&page=' + str(page)
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        # This site returns an HTML page, so the page needs to be parsed
        r = session.get(url = url, headers = headers)
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中基国际招标网', r.text)
            return False
        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[3]/div[3]/div[2]/div[2]/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = 'https://www.cbbidding.com' + item.xpath('//a')[0].attrs.get('href')
            region = '中基招标'
            publishDate = item.xpath('//div')[0].text
            try:
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
            except Exception as e:
                publishDate = publishDate.replace('.', '-')
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
                print(url, title)
            announcementType = typeParam['announcementType']
            print(publishDate, url, page, title)
            self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang International Tendering Co., Ltd. https://www.zmeetb.com/
    def CrawlPage_zmeetb(self, page, typeParam):
        # Crawl one specified page.
        session = HTMLSession()
        url = 'https://www.zmeetb.com/' + typeParam['announcementCode'] + '/index/p/' + str(page) + '.html'
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Cache-Control": "max-age=0",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
            "DNT": '1',
            "Host": "www.zmeetb.com",
            "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "Windows",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        # This site returns an HTML page, so the page needs to be parsed.
        # Calling render() on this site runs into an SSL certificate problem;
        # the chromium certificate handling still needs further investigation.
        #r = session.get(url = url, headers = headers, verify='/opt/PyGuoyan/www.zmeetb.com')
        r = session.get(url = url, headers = headers, verify=False)
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:浙江国际招标网', r.text)
            return False
        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[1]/div[3]/div[2]/div/div/div[3]/div/ul/li/a')
        for item in data:
            title = item.xpath('//p')[0].text
            url = item.attrs.get('href')
            region = '浙江国际招标'
            publishDate = item.xpath('//p')[1].text
            announcementType = typeParam['announcementType']
            print(publishDate, url, page, title)
            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo Mingcheng Tendering Co., Ltd. http://www.nbmcbidding.com/
    def CrawlPage_nbmcbidding(self, page, typeParam):
        # Crawl one specified page.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbmcbidding.com/news/99/" + str(page) + "/"
        else:
            url = "http://www.nbmcbidding.com/news/88/" + str(page) + "/"
        data = {}
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Host": "www.nbmcbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        r = session.get(url = url, headers = headers, json = data)
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波名诚招标代理有限公司', r.text)
            return False
        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[1]/div/div[3]/div[2]/ul/li')
        for item in data:
            title = item.xpath('//a/div[2]')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波名诚招标'
            publishDate = item.xpath('//a/div[4]')[0].text
            announcementType = typeParam['announcementType']
            print(publishDate, url, page, title)
            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo International Tendering Co., Ltd. http://www.nbbidding.com/
    def CrawlPage_nbbidding(self, page, typeParam):
        # Crawl one specified page.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbbidding.com/Home/Notice/news_list?page=" + str(page) + "&is_Open=1&keyword"
        else:
            url = "http://www.nbbidding.com/Home/Publicity/news_list?page=" + str(page) + "&is_Open=1&keyword"
        data = {}
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Host": "www.nbbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        r = session.get(url = url, headers = headers, json = data)
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国际招标网', r.text)
            return False
        data = json.loads(r.text)['data']
        total = data['page']['count']
        data = data['list']
        for item in data:
            id = item['id']
            if typeParam['announcementType'] == '采购公告':
                url = 'http://www.nbbidding.com/Home/Notice/news_detail?id=%s' % (id)
            else:
                url = 'http://www.nbbidding.com/Home/Publicity/news_detail?id=%s' % (id)
            title = item['title']
            region = '宁波国际招标'
            publishDate = item['addtime']
            announcementType = item['stage']
            print(publishDate, url, page, title)
            self.write_information(title, url, region, publishDate, announcementType)

    # Bidding information network for municipal SOEs under the Ningbo SASAC
    def CrawlPage_gzw_ningbo(self, page):
        # Crawl one specified page.
        session = HTMLSession()
        url = 'http://gzw.ningbo.gov.cn/col/col1229663137/index.html?uid=6085425&pageNum=%s' % str(page)
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        # This site returns an HTML page, so the page needs to be parsed
        r = session.get(url = url, headers = headers)
        r.html.render()
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国资委市属企业招标信息网', r.text)
            return False
        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[2]/div[3]/div/div/div[2]/div/div/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波市属国企'
            publishDate = item.xpath('//p')[0].text
            announcementType = '采购公告'
            print(publishDate, url, page, title)
            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo intermediary supermarket network
    # Updated 2024-03-29
    def CrawlPage_zjcs_nbxzfw(self, page, typeParam):
        # Crawl one specified page.
        # typeParam identifies the type of procurement information.
        session = HTMLSession()
        urllist = ['https://zjcs.zwb.ningbo.gov.cn/siteapi/api/Portal/GetBulletinInfoList']
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        payload = {
            "page": page,
            "pageSize": 10,
            "bulletin_type_id": typeParam["announcementCode"]
        }
        for url in urllist:
            r = session.post(url = url, headers = headers, json = payload)
            if r.status_code != 200:
                print("error")
                if page == 1:
                    gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text)
                return False
            data = json.loads(r.text)['body']
            total = data['total']
            data = data['data']['bulletinInfoList']
            for item in data:
                articleId = item['auto_id']
                BulletinTypeId = typeParam["announcementCode"]
                url = 'https://zjcs.zwb.ningbo.gov.cn/gDetails?id=%s' % (articleId)
                title = item['bulletin_title']
                region = '宁波中介超市'
                publishDate = item['publish_date'].replace('T', ' ')
                announcementType = typeParam['announcementType']
                print(publishDate, url, page, title)
                self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo sunshine procurement network
    def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
        # Crawl the given number of pages for the specified announcement type.
        url = 'https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType=' + typeParam['announcementCode']
        wait_for = '.ant-pagination-item-ellipsis'
        page_element = '.anticon-right'
        try:
            r = Splash().post(url, wait_for, pages=pages, page_element=page_element)
        except Exception as e:
            print(e)
            return False
        results = json.loads(r.text)
        if r.status_code != 200:
            gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, 错误代码:' + str(r.status_code), r.text)
            return False
        for i in range(1, pages + 1):
            data = HTML(html=results[str(i)]).xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[5]/div[1]/div/ul/li')
            if len(data) == 0:
                print('数据为空')
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, keyerror', '数据为空')
                return False
            for item in data:
                url = 'http://ygcg.nbcqjy.org' + item.xpath('//a')[0].attrs.get('href')
                title = item.xpath('//a/span[3]')[0].text
                region = '宁波阳光采购'
                publishDate = item.xpath('//div[2]')[0].text
                announcementType = typeParam['announcementType']
                print(publishDate, url, i, title)
                self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang government procurement network
    def CrawlPage_zfcg_czt_zj(self, page, typeParam):
        # Crawl one specified page.
        session = HTMLSession()
        url = 'https://zfcg.czt.zj.gov.cn/portal/category'
        if typeParam['announcementCode'] == '110-420383':
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "districtCode": ["339900"],
                "isProvince": True,
                "includeGovDistrict": "1",
                "_t": 1699104836000
            }
        else:
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "isGov": True,
                "excludeDistrictPrefix": "90",
                "_t": 1699104836000
            }
        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "content-type": "application/json;charset=UTF-8",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "x-requested-with": "XMLHttpRequest"
        }
        try:
            r = session.post(url = url, headers = headers, json = data)
        except Exception as e:
            print('10-------------------------', e)
            return False
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波政府采购网', r.text)
            return False
        data = json.loads(r.text)['result']['data']
        total = data['total']
        data = data['data']
        for item in data:
            publishDate = datetime.datetime.fromtimestamp(item['publishDate'] / 1000)
            pageUrl = 'https://zfcg.czt.zj.gov.cn/luban/detail?parentId=600007&articleId=' + item['articleId'] + '&utm=luban.luban-PC-37000.979-pc-websitegroup-zhejiang-secondPage-front.21.320086307d6811ee86314be74945ec2c'
            detailUrl = 'https://zfcg.czt.zj.gov.cn/portal/detail?articleId=' + item['articleId']
            announcementType = typeParam['announcementType']
            if announcementType == '采购意向':
                # Procurement intentions carry their details in an embedded HTML table,
                # so fetch the detail page and parse each table row.
                r = session.get(url = detailUrl, headers = headers)
                detailData = json.loads(r.text)['result']['data']
                if detailData is None:
                    break
                content = HTML(html='' + detailData['content'] + '')
                region = item['districtName']
                for detailItem in content.xpath('xml/div/div/div[1]/div/table/tbody/tr'):
                    title = detailItem.xpath('//td[2]')[0].text
                    cgxqqk = detailItem.xpath('//td[3]')[0].text
                    ysje = detailItem.xpath('//td[4]')[0].text
                    yjcgsj = detailItem.xpath('//td[5]')[0].text
                    ly = detailData["title"]
                    self.write_information(title, pageUrl, region, publishDate, announcementType)
                    self.write_information_cgyx({'cgxmmc': title, 'lj': pageUrl, 'cgxqqk': cgxqqk, 'ysje': ysje, 'yjcgsj': yjcgsj, 'ly': ly})
            else:
                title = item['title']
                region = item['districtName']
                print(publishDate, url, page, title)
                self.write_information(title, pageUrl, region, publishDate, announcementType)
                #print(publishDate, url)
        return True
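

# -----------------------------------------------------------------------------------------
# Usage sketch (assumption): this module does not define an entry point itself, so the block
# below only illustrates how the Crawler class is expected to be driven. The MySQL host,
# credentials and database name are placeholders, not values taken from the project.
# -----------------------------------------------------------------------------------------
if __name__ == '__main__':
    # Open the database connection that the Crawler methods commit/rollback on.
    connect = pymysql.connect(host='127.0.0.1', user='crawler', password='secret',
                              database='procurement', charset='utf8mb4')
    try:
        crawler = Crawler(connect)
        crawler.Crawl()   # run every site crawler once
        crawler.Check()   # mail a warning if nothing was collected in the last day
    finally:
        connect.close()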