售前信息平台
#!/usr/bin/python3
"""
===========================================================================================
文件名称:crawler.py
这是一个用于爬取采购信息的模块
要处理采购公告信息。主要涉及sc_cggg, calalog, catalogdata, readlog四张表
===========================================================================================
class Crawler:
def __init__(self, connect):
def generate_id(self):
def write_log_information(self, data_id, catalog_name):
def CrawlPage_gzw_ningbo(self, page): # 宁波国资委市属国企招标投标信息
def CrawlPage_zjcs_nbxzfw(self, type, page): # 宁波市中介超市
def CrawlPage_ygcg_nbcqjy_org(self, page): # 宁波市阳光采购
def CrawlPage_zfcg_czt_zj(self, page): # 浙江政府采购网
def CrawlPage_cbbidding(self, page): # 宁波中基国际招标有限公司
def CrawlPage_zmeetb(self, page): # 浙江国际招标有限公司
def CrawlPage_nbbidding(self, page): # 宁波国际招标有限公司
============================================================================================
"""
import datetime
import hashlib
import pymysql
import json
import random
from requests_html import HTMLSession
from requests_html import HTML, UserAgent
import gymailer
import time
'''
============================================================
This class wraps the Splash rendering service.
self.splash_ip is the IP address of the Splash server.
============================================================
'''
class Splash:
def __init__(self):
self.splash_ip = '127.0.0.1'
'''
============================================================
The wait_for parameter specifies the element to wait for: the request only returns once that
element has finished rendering, otherwise it keeps polling until the timeout expires.
wait_for takes a CSS selector, e.g. '#app' to target an element by id, or '.class-name'
to target an element by class.
============================================================
'''
def post(self, url, wait_for, pages=1, page_element='', headers={'content-type':'application/json','Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'}):
lua_scripts = """
function wait_for_element(splash, css, maxwait)
-- Wait until a selector matches an element
-- in the page. Return an error if waited more
-- than maxwait seconds.
if maxwait == nil then
maxwait = 10
end
return splash:wait_for_resume(string.format([[
function main(splash) {
var selector = '%s';
var maxwait = %s;
var end = Date.now() + maxwait*1000;
function check() {
if(document.querySelector(selector)) {
splash.resume('Element found');
} else if(Date.now() >= end) {
var err = 'Timeout waiting for element';
splash.error(err + " " + selector);
} else {
setTimeout(check, 200);
}
}
check();
}
]], css, maxwait))
end
function main(splash, args)
pages = """ + str(pages) + """
page_element = '""" + page_element + """'
wait_for = '""" + wait_for + """'
splash:go('""" + url + """')
wait_for_element(splash, wait_for)
wait_for_element(splash, page_element)
-- Add the first page's HTML to the result set
results = {splash.html()}
if pages == 1 then
return results
else
-- Turn the pages:
-- locate the paging element on the page, then send it a click() event
for i = 2, pages do
-- js is the JavaScript snippet that finds the paging element and fires the click event
js = string.format("document.querySelector('%s').click();", page_element)
-- Run the paging script
splash:runjs(js)
-- Wait for the page to finish loading
wait_for_element(splash, wait_for)
wait_for_element(splash, page_element)
-- An extra delay seems to be required here, otherwise the page update may not have finished
assert(splash:wait(5))
-- Add this page's HTML to the result set
table.insert(results, splash.html())
end
return results
end
end
"""
splash_url = 'http://' + self.splash_ip + ':8050/execute'
data = json.dumps({'lua_source':lua_scripts})
r = HTMLSession().post(splash_url, headers=headers, data=data)
return r
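# --- Usage sketch (illustrative only, not part of the crawler) ----------------------------
# Shows how the Splash wrapper above is typically driven: wait_for / page_element are CSS
# selectors, and the execute endpoint returns one HTML snapshot per crawled page. The URL,
# selectors and page count below are placeholders, not real targets.
def _splash_usage_example():
    splash = Splash()
    r = splash.post('https://example.com/list', wait_for='.list-item', pages=3, page_element='.next-page')
    if r.status_code == 200:
        snapshots = json.loads(r.text)       # the Lua result table comes back as a JSON object
        for key in sorted(snapshots):        # keys are "1", "2", ... one entry per page
            html = HTML(html=snapshots[key])
            print(key, len(html.find('a')))  # e.g. count the links found on each page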
class Crawler:
def __init__(self, connect):
self.connect = connect
def generate_id(self):
# Generate a 32-character hexadecimal ID
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + str(random.randint(0, 1000000))
md5_hash = hashlib.md5()
md5_hash.update(current_time.encode('utf-8'))
return md5_hash.hexdigest()
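# Illustrative check (assumption: only the shape of the ID matters): generate_id() hashes the
# current timestamp plus a random number with MD5, so it always yields 32 hex characters.
def _generate_id_example(crawler):
    new_id = crawler.generate_id()
    assert len(new_id) == 32 and all(c in '0123456789abcdef' for c in new_id)
    return new_id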
def write_log_information(self, data_id, catalog_name, log_type='采购公告'):
# After a record is inserted, the related bookkeeping rows must be written as well: one catalogdata row and one readlog row
with self.connect.cursor() as cursor:
affected_row = cursor.execute("select id from catalog where name = '%s'" % (log_type))
if affected_row == 0:
return False
result = cursor.fetchall()
catalog_id = result[0][0]
catalogdata_id = self.generate_id()
readlog_id = self.generate_id()
affected_row = cursor.execute("SELECT staffid FROM userinfo where username = 'root'")
if affected_row == 0:
return False
result = cursor.fetchall()
staff_id = result[0][0]
add_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
affected_row = cursor.execute(
'insert into catalogdata (id, dataid, catalogid, creatorid, menderid, adddate, modifydate, datastatus) values (%s, %s, %s, %s, %s, %s, %s, %s)',
(catalogdata_id, data_id, catalog_id, staff_id, staff_id, add_date, add_date, 0))
cursor.execute(
'insert into readlog (id, dataid, staffid, readnum, adddate, LastAccessDate, resid) values (%s, %s, %s, %s, %s, %s, %s)',
(readlog_id, data_id, staff_id, 1, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), catalog_id))
return True
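# Sketch of the same catalog lookup using pymysql parameter binding instead of '%' string
# interpolation (assumed equivalent in behaviour); the driver then handles quoting, so catalog
# names containing quotes cannot break the SQL. _find_catalog_id is a hypothetical helper name.
def _find_catalog_id(connect, log_type='采购公告'):
    with connect.cursor() as cursor:
        if cursor.execute("select id from catalog where name = %s", (log_type,)) == 0:
            return None
        return cursor.fetchall()[0][0]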
def write_information(self, title, url, region, publishTime, announcementType):
# Write a single announcement record into the database
with self.connect.cursor() as cursor:
cggg_id = self.generate_id()
try:
title = title.replace("'", "\\\'")
affected_rows = cursor.execute(
'insert into sc_cggg (id, bt, lj, ssqy, fbsj, gglb) values (%s, %s, %s, %s, %s, %s)',
(cggg_id, title, url, region, publishTime, announcementType))
except pymysql.err.IntegrityError:
print('信息重复')
self.connect.rollback()
return False
else:
if self.write_log_information(cggg_id, announcementType):
self.connect.commit()
else:
print('添加采购信息失败')
self.connect.rollback()
return False
return True
def write_information_cgyx(self, cgyx):
# Write a single procurement-intention (采购意向) record into the sc_cgyx table
with self.connect.cursor() as cursor:
cgyx_id = self.generate_id()
cgyx['cgxmmc'] = cgyx['cgxmmc'].replace("'", "\\\'")
strSql = 'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) values (\''+cgyx_id+'\',\''+cgyx['cgxmmc']+'\',\''+cgyx['lj']+'\',\''+cgyx['cgxqqk']+'\',\''+cgyx['ysje']+'\',\''+cgyx['yjcgsj']+'\',\''+cgyx['ly']+'\')'
try:
affected_rows = cursor.execute(strSql)
except pymysql.err.IntegrityError:
print('信息重复')
#self.connect.rollback()
return False
else:
if self.write_log_information(cgyx_id, '采购意向'):
self.connect.commit()
else:
print('添加采购信息失败')
self.connect.rollback()
return False
return True
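# The insert above builds its SQL by string concatenation, which breaks on values that contain
# quotes. A parameterized sketch of the same statement (hypothetical helper, same columns):
def _insert_cgyx_example(cursor, cgyx_id, cgyx):
    cursor.execute(
        'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) values (%s, %s, %s, %s, %s, %s, %s)',
        (cgyx_id, cgyx['cgxmmc'], cgyx['lj'], cgyx['cgxqqk'], cgyx['ysje'], cgyx['yjcgsj'], cgyx['ly']))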
def Check(self):
with self.connect.cursor() as cursor:
affected_row = cursor.execute("select id as total from sc_cggg where date(fbsj) > (NOW() - INTERVAL 1 DAY);")
if affected_row == 0:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息', '采购信息提取不正常,请检查!')
return False
else:
return True
def Crawl(self):
# Top-level entry point that drives all of the crawling work.
# Crawl 浙江政府采购网 (Zhejiang government procurement portal)
print('开始获取浙江政采网的信息\n')
# Announcement-type parameters handed to the page crawler
infoType = [
#{"announcementCode": "110-175885", "announcementType":"采购意向"},
#{"announcementCode": "110-978863", "announcementType":"采购公告"},
#{"announcementCode": "110-943756", "announcementType":"更正公告"},
{"announcementCode": "110-774650", "announcementType":"非政府采购公告"},
#{"announcementCode": "110-900461", "announcementType":"结果公告"}
]
for typeParam in infoType:
for page in range(1, 70):
try:
self.CrawlPage_zfcg_czt_zj(page, typeParam)
except Exception as e:
print('3--------------------------------', e)
# Crawl 宁波市阳光采购网
print('开始获取宁波市阳光采购网的信息\n')
infoType = [
{"announcementCode": "21", "announcementType":"采购公告"},
{"announcementCode": "23", "announcementType":"更正公告"},
{"announcementCode": "22", "announcementType":"结果公告"}
]
for typeParam in infoType:
try:
self.CrawlPage_ygcg_nbcqjy_org(2, typeParam)
except Exception as e:
print('4--------------------------------', e)
# Crawl 宁波市中介超市网
print('开始获取宁波市中介超市网的信息\n')
infoType = [
{"announcementCode": '10', "announcementType":"业务需求公告"},
{"announcementCode": '11', "announcementType":"业务需求补充公告"},
{"announcementCode": '20', "announcementType":"中选结果公告"},
{"announcementCode": '21', "announcementType":"中选结果补充公告"},
{"announcementCode": '22', "announcementType":"中选结果补充公告"}
]
for typeParam in infoType:
for page in range(1, 6):
try:
self.CrawlPage_zjcs_nbxzfw(page, typeParam)
except Exception as e:
print('5------------------------------', e)
# Crawl procurement information of municipal state-owned enterprises (宁波市国资委)
print('开始获取宁波市国资委市属企业招投标网的信息\n')
for page in range(1, 5):
try:
self.CrawlPage_gzw_ningbo(page)
except Exception as e:
print('6------------------------------', e)
# Crawl 宁波中基国际招标网
print('开始获取宁波中基国际招标网的信息\n')
infoType = [
{"announcementCode": "22", "announcementType":"采购公告"},
{"announcementCode": "23", "announcementType":"结果公告"}
]
for typeParam in infoType:
for page in range(1, 6):
try:
self.CrawlPage_cbbidding(page, typeParam)
except Exception as e:
print('7--------------------------------', e)
# Crawl 浙江国际招标网
print('开始获取浙江国际招标网的信息\n')
infoType = [
{"announcementCode": "Zbgg", "announcementType":"采购公告"},
{"announcementCode": "Gzgg", "announcementType":"更正公告"},
{"announcementCode": "jggg", "announcementType":"结果公告"}
]
for typeParam in infoType:
for page in range(1, 5):
try:
self.CrawlPage_zmeetb(page, typeParam)
except Exception as e:
print('8----------------------------', e)
# Crawl the website of 宁波市国际招标有限公司
print('开始获取宁波国际招标网的信息\n')
# Announcement-type parameters handed to the page crawler
infoType = [
{"announcementCode": "1", "announcementType":"采购公告"},
{"announcementCode": "1", "announcementType":"结果公告"},
{"announcementCode": "2", "announcementType":"采购公告"},
{"announcementCode": "2", "announcementType":"结果公告"}
]
for typeParam in infoType:
for page in range(1, 5):
try:
self.CrawlPage_nbbidding(page, typeParam)
except Exception as e:
print('9--------------------------------', e)
# Crawl the website of 宁波名诚招标代理有限公司
print('开始获取宁波名诚招标的信息\n')
# Announcement-type parameters handed to the page crawler
infoType = [
{"announcementCode": "99", "announcementType":"采购公告"},
{"announcementCode": "88", "announcementType":"结果公告"}
]
for typeParam in infoType:
for page in range(1, 2):
try:
self.CrawlPage_nbmcbidding(page, typeParam)
except Exception as e:
print('10--------------------------------', e)
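# Every source above follows the same pattern: a list of {"announcementCode", "announcementType"}
# dicts is iterated and each (page, typeParam) pair is handed to the matching CrawlPage_* method.
# A generic sketch of that loop (hypothetical helper, not called anywhere):
def _crawl_one_source(crawl_page_method, info_type, max_page):
    for typeParam in info_type:
        for page in range(1, max_page):
            try:
                crawl_page_method(page, typeParam)
            except Exception as e:
                print('crawl error --------------------------------', e)
# e.g. _crawl_one_source(crawler.CrawlPage_zmeetb, infoType, 5)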
# 宁波中基国际招标有限公司 https://www.cbbidding.com/
def CrawlPage_cbbidding(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
session.DEFAULT_RETRIES = 5
url = 'https://www.cbbidding.com/Index/cms.html?mid=' +typeParam['announcementCode'] + '&%2FIndex%2Fcms%2Fmid%2F' + typeParam['announcementCode'] + '_html=&page=' + str(page)
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"DNT": '1',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
# 这个网站返回的是一个网页,所以需要进行网页解析
r = session.get(url = url, headers = headers)
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中基国际招标网', r.text)
return False
# 注意:xpath 函数返回的是list对象, 对象的元素是element
data = r.html.xpath('/html/body/div[3]/div[3]/div[2]/div[2]/div/ul/li')
for item in data:
title = item.xpath('//a')[0].text
url = 'https://www.cbbidding.com' + item.xpath('//a')[0].attrs.get('href')
region = '中基招标'
publishDate = item.xpath('//div')[0].text
try:
publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
except Exception as e:
publishDate = publishDate.replace('.', '-')
publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
print(url, title)
announcementType = typeParam['announcementType']
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
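# The publish date on this site appears either as 'YYYY-MM-DD' or 'YYYY.MM.DD'; the fallback
# above handles the dotted form. A compact equivalent (hypothetical helper) that tries both:
def _parse_publish_date(text):
    for fmt in ('%Y-%m-%d', '%Y.%m.%d'):
        try:
            return str(datetime.datetime.strptime(text, fmt))
        except ValueError:
            continue
    raise ValueError('unrecognised date format: ' + text)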
# 浙江国际招投标有限公司 https://www.zmeetb.com/
def CrawlPage_zmeetb(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
url = 'https://www.zmeetb.com/' +typeParam['announcementCode'] + '/index/p/' + str(page) + '.html'
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"Cache-Control": "max-age=0",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "close",
"DNT": '1',
"Host": "www.zmeetb.com",
"sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="99"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "Windows",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
# 这个网站返回的是一个网页,所以需要进行网页解析
# 这个网站如果使用render()函数,会遇到ssl证书问题,需要进一步研究chromium浏览器的证书问题
#r = session.get(url = url, headers = headers, verify='/opt/PyGuoyan/www.zmeetb.com')
r = session.get(url = url, headers = headers, verify=False)
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:浙江国际招标网', r.text)
return False
# 注意:xpath 函数返回的是list对象, 对象的元素是element
data = r.html.xpath('/html/body/div[1]/div[3]/div[2]/div/div/div[3]/div/ul/li/a')
for item in data:
title = item.xpath('//p')[0].text
url = item.attrs.get('href')
region = '浙江国际招标'
publishDate = item.xpath('//p')[1].text
announcementType = typeParam['announcementType']
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
# 宁波市名诚招标有限有限公司 http://www.nbmcbidding.com/
def CrawlPage_nbmcbidding(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
if typeParam['announcementType'] == '采购公告':
url = "http://www.nbmcbidding.com/news/99/"+str(page)+"/"
else:
url = "http://www.nbmcbidding.com/news/88/"+str(page)+"/"
data = {}
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Host": "www.nbmcbidding.com",
'Connection': 'keep-alive',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
r = session.get(url = url, headers = headers, json = data)
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波名诚招标代理有限公司', r.text)
return False
# 注意:xpath 函数返回的是list对象, 对象的元素是element
data = r.html.xpath('/html/body/div[1]/div/div[3]/div[2]/ul/li')
for item in data:
title = item.xpath('//a/div[2]')[0].text
url = item.xpath('//a')[0].attrs.get('href')
region = '宁波名诚招标'
publishDate = item.xpath('//a/div[4]')[0].text
announcementType = typeParam['announcementType']
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
# 宁波市国际招标有限公司 http://www.nbbidding.com/
def CrawlPage_nbbidding(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
if typeParam['announcementType'] == '采购公告':
url = "http://www.nbbidding.com/Home/Notice/news_list?page="+str(page)+"&is_Open=1&keyword"
else:
url = "http://www.nbbidding.com/Home/Publicity/news_list?page="+str(page)+"&is_Open=1&keyword"
data = {}
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Host": "www.nbbidding.com",
'Connection': 'keep-alive',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
r = session.get(url = url, headers = headers, json = data)
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国际招标网', r.text)
return False
data = json.loads(r.text)['data']
total = data['page']['count']
data = data['list']
for item in data:
id = item['id']
if typeParam['announcementType'] == '采购公告':
url = 'http://www.nbbidding.com/Home/Notice/news_detail?id=%s' % (id)
else:
url = 'http://www.nbbidding.com/Home/Publicity/news_detail?id=%s' % (id)
title = item['title']
region = '宁波国际招标'
publishDate = item['addtime']
announcementType = item['stage']
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
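# For reference, the JSON parsed above is assumed to look roughly like this (field names are
# taken from the parsing code; the concrete values are invented):
_nbbidding_example_response = {
    "data": {
        "page": {"count": 120},
        "list": [
            {"id": 1001, "title": '某采购项目公告', "addtime": '2024-03-29 10:00:00', "stage": '采购公告'}
        ]
    }
}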
# 宁波市国资委属企业招标信息网
def CrawlPage_gzw_ningbo(self, page):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
url = 'http://gzw.ningbo.gov.cn/col/col1229663137/index.html?uid=6085425&pageNum=%s' % str(page)
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"DNT": '1',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
# 这个网站返回的是一个网页,所以需要进行网页解析
r = session.get(url = url, headers = headers)
r.html.render()
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国资委市属企业招标信息网', r.text)
return False
# 注意:xpath 函数返回的是list对象, 对象的元素是element
data = r.html.xpath('/html/body/div[2]/div[3]/div/div/div[2]/div/div/div/ul/li')
for item in data:
title = item.xpath('//a')[0].text
url = item.xpath('//a')[0].attrs.get('href')
region = '宁波市属国企'
publishDate = item.xpath('//p')[0].text
announcementType = '采购公告'
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
# 宁波市中介超市网
# 2024-03-29 更新
def CrawlPage_zjcs_nbxzfw(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
# type 用于判别采购信息的类型
session = HTMLSession()
urllist = ['https://zjcs.zwb.ningbo.gov.cn/siteapi/api/Portal/GetBulletinInfoList']
headers = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"Accept-Encoding": "gzip, deflate",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
"Connection": "keep-alive",
"DNT": '1',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
}
payload = {
"page": page,
"pageSize": 10,
"bulletin_type_id": typeParam["announcementCode"]
}
for url in urllist:
r = session.post(url = url, headers = headers, json = payload)
if r.status_code != 200:
print("error")
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text)
return False
data = json.loads(r.text)['body']
total = data['total']
data = data['data']['bulletinInfoList']
for item in data:
articleId = item['auto_id']
BulletinTypeId = typeParam["announcementCode"]
url = 'https://zjcs.zwb.ningbo.gov.cn/gDetails?id=%s' % (articleId)
title = item['bulletin_title']
region = '宁波中介超市'
publishDate = item['publish_date'].replace('T', ' ')
announcementType = typeParam['announcementType']
print(publishDate, url, page, title)
self.write_information(title, url, region, publishDate, announcementType)
# 宁波阳光采购网
def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
url = 'https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType=' + typeParam['announcementCode']
wait_for = '.ant-pagination-item-ellipsis'
page_element = '.anticon-right'
try:
r = Splash().post(url, wait_for, pages=pages, page_element=page_element)
except Exception as e:
print(e)
return False
results = json.loads(r.text)
# 这个方法是实际爬取指定页面的信息。
if r.status_code != 200:
# this method fetches all pages in a single Splash call, so alert on any failure
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, 错误代码:'+str(r.status_code), r.text)
return False
for i in range(1, pages + 1):
data = HTML(html=results[str(i)]).xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[5]/div[1]/div/ul/li')
if len(data) == 0:
print('数据为空')
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, 数据为空', '第' + str(i) + '页未解析到数据')
return False
for item in data:
url = 'http://ygcg.nbcqjy.org' + item.xpath('//a')[0].attrs.get('href')
title = item.xpath('//a/span[3]')[0].text
region = '宁波阳光采购'
publishDate = item.xpath('//div[2]')[0].text
announcementType = typeParam['announcementType']
print(publishDate, url, i, title)
self.write_information(title, url, region, publishDate, announcementType)
# 浙江政府采购网
def CrawlPage_zfcg_czt_zj(self, page, typeParam):
# 这个方法是实际爬取指定页面的信息。
session = HTMLSession()
url = 'https://zfcg.czt.zj.gov.cn/portal/category'
if typeParam['announcementCode'] == '110-420383':
data = {
"pageNo": page,
"pageSize": 15,
"categoryCode": typeParam['announcementCode'],
"districtCode": ["339900"],
"isProvince": True,
"includeGovDistrict": "1",
"_t": 1699104836000
}
else:
data = {
"pageNo": page,
"pageSize": 15,
"categoryCode": typeParam['announcementCode'],
"isGov": True,
"excludeDistrictPrefix": "90",
"_t": 1699104836000
}
headers = {
"accept": "application/json, text/plain, */*",
"accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
"content-type": "application/json;charset=UTF-8",
"sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"x-requested-with": "XMLHttpRequest"
}
try:
r = session.post(url = url, headers = headers, json = data)
except Exception as e:
print('10-------------------------', e)
return False
if r.status_code != 200:
if page == 1:
gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波政府采购网', r.text)
return False
data = json.loads(r.text)['result']['data']
total = data['total']
data = data['data']
for item in data:
publishDate = datetime.datetime.fromtimestamp(item['publishDate']/1000)
pageUrl = 'https://zfcg.czt.zj.gov.cn/luban/detail?parentId=600007&articleId=' + item['articleId'] + '&utm=luban.luban-PC-37000.979-pc-websitegroup-zhejiang-secondPage-front.21.320086307d6811ee86314be74945ec2c'
detailUrl = 'https://zfcg.czt.zj.gov.cn/portal/detail?articleId=' + item['articleId']
announcementType = typeParam['announcementType']
if announcementType == '采购意向':
r = session.get(url = detailUrl, headers = headers)
detailData = json.loads(r.text)['result']['data']
if detailData is None:
break
content = HTML(html='<xml>'+detailData['content']+'</xml>')
region = item['districtName']
for detailItem in content.xpath('xml/div/div/div[1]/div/table/tbody/tr'):
title = detailItem.xpath('//td[2]')[0].text
cgxqqk = detailItem.xpath('//td[3]')[0].text
ysje = detailItem.xpath('//td[4]')[0].text
yjcgsj = detailItem.xpath('//td[5]')[0].text
ly = detailData["title"]
self.write_information(title, pageUrl, region, publishDate, announcementType)
self.write_information_cgyx({'cgxmmc':title,'lj':pageUrl, 'cgxqqk':cgxqqk, 'ysje':ysje, 'yjcgsj':yjcgsj, 'ly':ly})
else:
title = item['title']
region = item['districtName']
print(publishDate, url, page, title)
self.write_information(title, pageUrl, region, publishDate, announcementType)
#print(publishDate, url)
return True
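# --- Module usage sketch -------------------------------------------------------------------
# How this module is presumably wired together: open a pymysql connection, hand it to Crawler,
# run Crawl() and then Check(). Host, credentials and database name below are placeholders.
if __name__ == '__main__':
    connect = pymysql.connect(host='127.0.0.1', user='user', password='password',
                              database='presales', charset='utf8mb4')
    try:
        crawler = Crawler(connect)
        crawler.Crawl()   # fetch announcements from all configured sources
        crawler.Check()   # mail a warning if nothing was collected in the last day
    finally:
        connect.close()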