#!/usr/bin/python3

"""
===========================================================================================
File name: crawler.py

A module that crawls procurement information.

It processes procurement announcements and touches four tables: sc_cggg, catalog,
catalogdata and readlog.
===========================================================================================

class Crawler:

    def __init__(self, connect):

    def generate_id(self):

    def write_log_information(self, data_id, catalog_name, log_type='采购公告'):

    def CrawlPage_gzw_ningbo(self, page):                   # Ningbo SASAC municipal SOE bidding information
    def CrawlPage_zjcs_nbxzfw(self, page, typeParam):       # Ningbo intermediary supermarket
    def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):  # Ningbo sunshine procurement
    def CrawlPage_zfcg_czt_zj(self, page, typeParam):       # Zhejiang government procurement
    def CrawlPage_cbbidding(self, page, typeParam):         # Ningbo CB International Tendering Co., Ltd.
    def CrawlPage_zmeetb(self, page, typeParam):            # Zhejiang International Tendering Co., Ltd.
    def CrawlPage_nbbidding(self, page, typeParam):         # Ningbo International Tendering Co., Ltd.
    def CrawlPage_nbmcbidding(self, page, typeParam):       # Ningbo Mingcheng Bidding Agency Co., Ltd.
============================================================================================
"""

import datetime
import hashlib
import json
import random
import time

import pymysql
from requests_html import HTMLSession
from requests_html import HTML, UserAgent

import gymailer
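
# gymailer is assumed to be a local helper module that wraps outgoing email;
# everywhere in this file it is called as
# gymailer.SendMail(sender, recipient, subject, body).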


'''
============================================================
This class wraps the Splash rendering service.

self.splash_ip is the IP address of the Splash service.
============================================================
'''


class Splash:
    def __init__(self):
        self.splash_ip = '127.0.0.1'

    '''
    ============================================================
    The wait_for parameter specifies the element to wait for: the call only
    returns once that element has been rendered, and gives up after 200
    seconds otherwise. wait_for takes a CSS selector: use the "#app" form to
    target an element id and the '.class-name' form to target an element
    class.
    ============================================================
    '''
    def post(self, url, wait_for, pages=1, page_element='', headers={'content-type': 'application/json', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'}):
        lua_scripts = """
        function wait_for_element(splash, css, maxwait)
            -- Wait until a selector matches an element
            -- in the page. Return an error if waited more
            -- than maxwait seconds.
            if maxwait == nil then
                maxwait = 10
            end
            return splash:wait_for_resume(string.format([[
                function main(splash) {
                    var selector = '%s';
                    var maxwait = %s;
                    var end = Date.now() + maxwait*1000;

                    function check() {
                        if(document.querySelector(selector)) {
                            splash.resume('Element found');
                        } else if(Date.now() >= end) {
                            var err = 'Timeout waiting for element';
                            splash.error(err + " " + selector);
                        } else {
                            setTimeout(check, 200);
                        }
                    }
                    check();
                }
            ]], css, maxwait))
        end

        function main(splash, args)
            pages = """ + str(pages) + """
            page_element = '""" + page_element + """'
            wait_for = '""" + wait_for + """'
            splash:go('""" + url + """')
            wait_for_element(splash, wait_for)
            wait_for_element(splash, page_element)

            -- Add the first page to the result set
            results = {splash:html()}

            if pages == 1 then
                return results
            else
                -- Turn the pages: locate the paging element on the page,
                -- then send it a click() event
                for i = 2, pages do
                    -- js holds the JavaScript that fetches the paging
                    -- element and sends the click event
                    js = string.format("document.querySelector('%s').click();", page_element)

                    -- Run the paging script
                    splash:runjs(js)

                    -- Wait for the page to finish loading
                    wait_for_element(splash, wait_for)
                    wait_for_element(splash, page_element)

                    -- A delay seems to be required here; without it the page
                    -- may not have finished updating
                    assert(splash:wait(5))

                    -- Append the page to the result set
                    table.insert(results, splash:html())
                end
                return results
            end
        end
        """

        splash_url = 'http://' + self.splash_ip + ':8050/execute'
        data = json.dumps({'lua_source': lua_scripts})
        r = HTMLSession().post(splash_url, headers=headers, data=data)
        return r
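

# A minimal usage sketch for Splash.post (illustrative only: the URL and the
# two CSS selectors below are placeholder assumptions, not real endpoints).
# The Splash /execute endpoint returns the Lua result as JSON; as the code in
# CrawlPage_ygcg_nbcqjy_org shows, the result table arrives as an object
# keyed by page number strings:
#
#     r = Splash().post('https://example.com/list', wait_for='.list-item',
#                       pages=2, page_element='.next-page')
#     if r.status_code == 200:
#         snapshots = json.loads(r.text)   # e.g. {'1': '<html>...', '2': '<html>...'}
#         first_page = HTML(html=snapshots['1'])
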

class Crawler:
    def __init__(self, connect):
        self.connect = connect

    def generate_id(self):
        # Generate a 32-character hex ID: MD5 of a second-resolution timestamp
        # plus a random integer. Collisions are unlikely but not impossible.
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + str(random.randint(0, 1000000))
        md5_hash = hashlib.md5()
        md5_hash.update(current_time.encode('utf-8'))
        return md5_hash.hexdigest()

    def write_log_information(self, data_id, catalog_name, log_type='采购公告'):
        # After a record has been added, the related bookkeeping has to be
        # updated as well: one catalogdata row and one readlog row.
        # Note that catalog_name is currently unused; the catalog row is
        # looked up by log_type.
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute('select id from catalog where name = %s', (log_type,))
            if affected_row == 0:
                return False

            result = cursor.fetchall()
            catalog_id = result[0][0]
            catalogdata_id = self.generate_id()
            readlog_id = self.generate_id()

            affected_row = cursor.execute("SELECT staffid FROM userinfo where username = 'root'")
            if affected_row == 0:
                return False

            result = cursor.fetchall()
            staff_id = result[0][0]
            add_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            affected_row = cursor.execute(
                'insert into catalogdata (id, dataid, catalogid, creatorid, menderid, adddate, modifydate, datastatus) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                (catalogdata_id, data_id, catalog_id, staff_id, staff_id, add_date, add_date, 0))

            cursor.execute(
                'insert into readlog (id, dataid, staffid, readnum, adddate, LastAccessDate, resid) values (%s, %s, %s, %s, %s, %s, %s)',
                (readlog_id, data_id, staff_id, 1, add_date, add_date, catalog_id))

        return True

    def write_information(self, title, url, region, publishTime, announcementType):
        # Write one announcement record to the database.
        with self.connect.cursor() as cursor:
            cggg_id = self.generate_id()

            try:
                # pymysql parameter binding handles quoting, so no manual
                # escaping of the title is needed.
                affected_rows = cursor.execute(
                    'insert into sc_cggg (id, bt, lj, ssqy, fbsj, gglb) values (%s, %s, %s, %s, %s, %s)',
                    (cggg_id, title, url, region, publishTime, announcementType))
            except pymysql.err.IntegrityError:
                print('Duplicate record')
                self.connect.rollback()
                return False
            else:
                if self.write_log_information(cggg_id, announcementType):
                    self.connect.commit()
                else:
                    print('Failed to add procurement record')
                    self.connect.rollback()
                    return False

        return True
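
    # Shape of the dict expected by write_information_cgyx below; the keys
    # mirror the sc_cgyx columns and the dict is built this way by
    # CrawlPage_zfcg_czt_zj:
    #
    #     {'cgxmmc': project name, 'lj': link, 'cgxqqk': requirement summary,
    #      'ysje': budget amount, 'yjcgsj': planned purchase date, 'ly': source title}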

    def write_information_cgyx(self, cgyx):
        # Write one procurement-intention record to the database.
        with self.connect.cursor() as cursor:
            cgyx_id = self.generate_id()
            try:
                # Use parameter binding instead of string-concatenated SQL so
                # that quotes in the scraped values cannot break the statement.
                affected_rows = cursor.execute(
                    'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) values (%s, %s, %s, %s, %s, %s, %s)',
                    (cgyx_id, cgyx['cgxmmc'], cgyx['lj'], cgyx['cgxqqk'], cgyx['ysje'], cgyx['yjcgsj'], cgyx['ly']))
            except pymysql.err.IntegrityError:
                print('Duplicate record')
                return False
            else:
                if self.write_log_information(cgyx_id, '采购意向'):
                    self.connect.commit()
                else:
                    print('Failed to add procurement record')
                    self.connect.rollback()
                    return False

        return True

    def Check(self):
        # Send an alert email if no announcement has been stored within the
        # last day (date(fbsj) truncates to midnight, so this effectively asks
        # for anything published since yesterday).
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id as total from sc_cggg where date(fbsj) > (NOW() - INTERVAL 1 DAY);")
            if affected_row == 0:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert', 'Procurement information extraction looks abnormal, please check!')
                return False
            else:
                return True

    def Crawl(self):
        # Top-level entry point that performs the actual crawling.

        # Crawl the Zhejiang government procurement site
        print('Fetching Zhejiang government procurement announcements\n')

        # Announcement-type parameters passed to the page crawler
        infoType = [
            {"announcementCode": "110-175885", "announcementType": "采购意向"},
            {"announcementCode": "110-978863", "announcementType": "采购公告"},
            {"announcementCode": "110-943756", "announcementType": "更正公告"},
            {"announcementCode": "110-774650", "announcementType": "非政府采购公告"},
            {"announcementCode": "110-900461", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 11):
                try:
                    self.CrawlPage_zfcg_czt_zj(page, typeParam)
                except Exception as e:
                    print('3--------------------------------', e)

        # Crawl the Ningbo sunshine procurement site
        print('Fetching Ningbo sunshine procurement announcements\n')
        infoType = [
            {"announcementCode": "21", "announcementType": "采购公告"},
            {"announcementCode": "23", "announcementType": "更正公告"},
            {"announcementCode": "22", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            try:
                self.CrawlPage_ygcg_nbcqjy_org(2, typeParam)
            except Exception as e:
                print('4--------------------------------', e)

        # Crawl the Ningbo intermediary supermarket site
        print('Fetching Ningbo intermediary supermarket announcements\n')
        infoType = [
            {"announcementCode": '10', "announcementType": "业务需求公告"},
            {"announcementCode": '11', "announcementType": "业务需求补充公告"},
            {"announcementCode": '20', "announcementType": "中选结果公告"},
            {"announcementCode": '21', "announcementType": "中选结果补充公告"},
            {"announcementCode": '22', "announcementType": "中选结果补充公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_zjcs_nbxzfw(page, typeParam)
                except Exception as e:
                    print('5------------------------------', e)

        # Crawl the procurement information of Ningbo SASAC municipal SOEs
        print('Fetching Ningbo SASAC municipal SOE bidding announcements\n')
        for page in range(1, 5):
            try:
                self.CrawlPage_gzw_ningbo(page)
            except Exception as e:
                print('6------------------------------', e)

        # Crawl the Ningbo CB International Tendering site
        print('Fetching Ningbo CB International Tendering announcements\n')
        infoType = [
            {"announcementCode": "22", "announcementType": "采购公告"},
            {"announcementCode": "23", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_cbbidding(page, typeParam)
                except Exception as e:
                    print('7--------------------------------', e)

        # Crawl the Zhejiang International Tendering site
        print('Fetching Zhejiang International Tendering announcements\n')
        infoType = [
            {"announcementCode": "Zbgg", "announcementType": "采购公告"},
            {"announcementCode": "Gzgg", "announcementType": "更正公告"},
            {"announcementCode": "jggg", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_zmeetb(page, typeParam)
                except Exception as e:
                    print('8----------------------------', e)

        # Crawl the Ningbo International Tendering site
        print('Fetching Ningbo International Tendering announcements\n')

        # Announcement-type parameters passed to the page crawler
        infoType = [
            {"announcementCode": "1", "announcementType": "采购公告"},
            {"announcementCode": "1", "announcementType": "结果公告"},
            {"announcementCode": "2", "announcementType": "采购公告"},
            {"announcementCode": "2", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_nbbidding(page, typeParam)
                except Exception as e:
                    print('9--------------------------------', e)

        # Crawl the Ningbo Mingcheng Bidding Agency site
        print('Fetching Ningbo Mingcheng Bidding Agency announcements\n')

        # Announcement-type parameters passed to the page crawler
        infoType = [
            {"announcementCode": "99", "announcementType": "采购公告"},
            {"announcementCode": "88", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 2):
                try:
                    self.CrawlPage_nbmcbidding(page, typeParam)
                except Exception as e:
                    print('10--------------------------------', e)

    # Ningbo CB International Tendering Co., Ltd.  https://www.cbbidding.com/
    def CrawlPage_cbbidding(self, page, typeParam):
        # Crawl one listing page of this site.
        session = HTMLSession()
        session.DEFAULT_RETRIES = 5
        url = 'https://www.cbbidding.com/Index/cms.html?mid=' + typeParam['announcementCode'] + '&%2FIndex%2Fcms%2Fmid%2F' + typeParam['announcementCode'] + '_html=&page=' + str(page)

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the response has to be parsed
        r = session.get(url=url, headers=headers)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo CB International Tendering', r.text)
            return False

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[3]/div[3]/div[2]/div[2]/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = 'https://www.cbbidding.com' + item.xpath('//a')[0].attrs.get('href')
            region = '中基招标'
            publishDate = item.xpath('//div')[0].text

            try:
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
            except Exception as e:
                # Some entries use '.' as the date separator
                publishDate = publishDate.replace('.', '-')
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))

            print(url, title)
            announcementType = typeParam['announcementType']
            self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang International Tendering Co., Ltd.  https://www.zmeetb.com/
    def CrawlPage_zmeetb(self, page, typeParam):
        # Crawl one listing page of this site.
        session = HTMLSession()
        url = 'https://www.zmeetb.com/' + typeParam['announcementCode'] + '/index/p/' + str(page) + '.html'

        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Cache-Control": "max-age=0",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
            "DNT": '1',
            "Host": "www.zmeetb.com",
            "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "Windows",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the response has to be parsed.
        # render() runs into an SSL certificate problem on this site (the
        # chromium certificate handling needs further study), so certificate
        # verification is disabled for now.
        # r = session.get(url=url, headers=headers, verify='/opt/PyGuoyan/www.zmeetb.com')
        r = session.get(url=url, headers=headers, verify=False)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Zhejiang International Tendering', r.text)
            return False

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[1]/div[3]/div[2]/div/div/div[3]/div/ul/li/a')
        for item in data:
            title = item.xpath('//p')[0].text
            url = item.attrs.get('href')
            region = '浙江国际招标'
            publishDate = item.xpath('//p')[1].text
            announcementType = typeParam['announcementType']

            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo Mingcheng Bidding Agency Co., Ltd.  http://www.nbmcbidding.com/
    def CrawlPage_nbmcbidding(self, page, typeParam):
        # Crawl one listing page of this site.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbmcbidding.com/news/99/" + str(page) + "/"
        else:
            url = "http://www.nbmcbidding.com/news/88/" + str(page) + "/"

        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Host": "www.nbmcbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        r = session.get(url=url, headers=headers)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo Mingcheng Bidding Agency', r.text)
            return False

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[1]/div/div[3]/div[2]/ul/li')
        for item in data:
            title = item.xpath('//a/div[2]')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波名诚招标'
            publishDate = item.xpath('//a/div[4]')[0].text
            announcementType = typeParam['announcementType']

            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo International Tendering Co., Ltd.  http://www.nbbidding.com/
    def CrawlPage_nbbidding(self, page, typeParam):
        # Crawl one listing page of this site (the endpoint returns JSON).
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbbidding.com/Home/Notice/news_list?page=" + str(page) + "&is_Open=1&keyword"
        else:
            url = "http://www.nbbidding.com/Home/Publicity/news_list?page=" + str(page) + "&is_Open=1&keyword"

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Host": "www.nbbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        r = session.get(url=url, headers=headers)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo International Tendering', r.text)
            return False

        data = json.loads(r.text)['data']
        total = data['page']['count']
        data = data['list']

        for item in data:
            id = item['id']
            if typeParam['announcementType'] == '采购公告':
                url = 'http://www.nbbidding.com/Home/Notice/news_detail?id=%s' % (id)
            else:
                url = 'http://www.nbbidding.com/Home/Publicity/news_detail?id=%s' % (id)
            title = item['title']
            region = '宁波国际招标'
            publishDate = item['addtime']
            announcementType = item['stage']
            self.write_information(title, url, region, publishDate, announcementType)

            print(publishDate, title, url)

    # Ningbo SASAC municipal SOE bidding information site
    def CrawlPage_gzw_ningbo(self, page):
        # Crawl one listing page of this site.
        session = HTMLSession()
        url = 'http://gzw.ningbo.gov.cn/col/col1229663137/index.html?uid=6085425&pageNum=%s' % str(page)

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the response has to be parsed
        r = session.get(url=url, headers=headers)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo SASAC municipal SOE bidding site', r.text)
            return False

        # The listing is rendered by JavaScript, so render before parsing
        r.html.render()

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[2]/div[3]/div/div/div[2]/div/div/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波市属国企'
            publishDate = item.xpath('//p')[0].text
            announcementType = '采购公告'
            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo intermediary supermarket site
    # Updated 2024-03-29
    def CrawlPage_zjcs_nbxzfw(self, page, typeParam):
        # Crawl one listing page of this site.
        # typeParam identifies the type of procurement information.
        session = HTMLSession()
        # Note: the endpoint serves plain HTTP on port 443
        urllist = ['http://122.247.77.99:443/siteapi/api/Portal/GetBulletinInfoList']

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        payload = {
            "page": page,
            "pageSize": 10,
            "center_id": "",
            "bulletin_type_id": typeParam["announcementCode"]
        }

        for url in urllist:
            r = session.post(url=url, headers=headers, json=payload)

            if r.status_code != 200:
                if page == 1:
                    gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo intermediary supermarket site', r.text)
                return False

            data = json.loads(r.text)['body']

            total = data['total']
            data = data['data']['bulletinInfoList']

            for item in data:
                articleId = item['auto_id']
                BulletinTypeId = typeParam["announcementCode"]
                url = 'http://122.247.77.99:443/gDetails?id=%s' % (articleId)
                title = item['bulletin_title']
                region = '宁波中介超市'
                publishDate = item['publish_date'].replace('T', ' ')
                announcementType = typeParam['announcementType']
                self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo sunshine procurement site
    def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
        # Crawl the first `pages` listing pages of this site through Splash,
        # since the listing is rendered by JavaScript.
        url = 'https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType=' + typeParam['announcementCode']

        wait_for = '.ant-pagination-item-ellipsis'
        page_element = '.anticon-right'
        try:
            r = Splash().post(url, wait_for, pages=pages, page_element=page_element)
        except Exception as e:
            print(e)
            return False

        if r.status_code != 200:
            gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo sunshine procurement site, status code ' + str(r.status_code), r.text)
            return False

        results = json.loads(r.text)

        for i in range(1, pages + 1):
            data = HTML(html=results[str(i)]).xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[5]/div[1]/div/ul/li')
            if len(data) == 0:
                print('Empty result list')
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Ningbo sunshine procurement site', 'Empty result list on page %d' % i)
                return False

            for item in data:
                url = 'http://ygcg.nbcqjy.org' + item.xpath('//a')[0].attrs.get('href')
                title = item.xpath('//a/span[3]')[0].text
                region = '宁波阳光采购'
                publishDate = item.xpath('//div[2]')[0].text
                announcementType = typeParam['announcementType']
                print(title)
                self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang government procurement site
    def CrawlPage_zfcg_czt_zj(self, page, typeParam):
        # Crawl one listing page of this site (the endpoint returns JSON).
        session = HTMLSession()
        url = 'https://zfcg.czt.zj.gov.cn/portal/category'
        if typeParam['announcementCode'] == '110-420383':
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "districtCode": ["339900"],
                "isProvince": True,
                "includeGovDistrict": "1",
                "_t": 1699104836000
            }
        else:
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "isGov": True,
                "excludeDistrictPrefix": "90",
                "_t": 1699104836000
            }

        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "content-type": "application/json;charset=UTF-8",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "x-requested-with": "XMLHttpRequest"
        }

        try:
            r = session.post(url=url, headers=headers, json=data)
        except Exception as e:
            print('10-------------------------', e)
            return False

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: Zhejiang government procurement site', r.text)
            return False

        data = json.loads(r.text)['result']['data']
        total = data['total']
        data = data['data']

        for item in data:
            publishDate = datetime.datetime.fromtimestamp(item['publishDate'] / 1000)
            pageUrl = 'https://zfcg.czt.zj.gov.cn/luban/detail?parentId=600007&articleId=' + item['articleId'] + '&utm=luban.luban-PC-37000.979-pc-websitegroup-zhejiang-secondPage-front.21.320086307d6811ee86314be74945ec2c'
            detailUrl = 'https://zfcg.czt.zj.gov.cn/portal/detail?articleId=' + item['articleId']
            announcementType = typeParam['announcementType']
            if announcementType == '采购意向':
                # Procurement intentions carry their items in an embedded HTML
                # table on the detail page, so fetch and parse the detail.
                r = session.get(url=detailUrl, headers=headers)

                detailData = json.loads(r.text)['result']['data']
                if detailData is None:
                    break

                content = HTML(html='<xml>' + detailData['content'] + '</xml>')
                region = item['districtName']
                for detailItem in content.xpath('xml/div/div/div[1]/div/table/tbody/tr'):
                    title = detailItem.xpath('//td[2]')[0].text
                    cgxqqk = detailItem.xpath('//td[3]')[0].text
                    ysje = detailItem.xpath('//td[4]')[0].text
                    yjcgsj = detailItem.xpath('//td[5]')[0].text
                    ly = detailData["title"]

                    self.write_information(title, pageUrl, region, publishDate, announcementType)
                    self.write_information_cgyx({'cgxmmc': title, 'lj': pageUrl, 'cgxqqk': cgxqqk, 'ysje': ysje, 'yjcgsj': yjcgsj, 'ly': ly})
            else:
                title = item['title']
                region = item['districtName']
                self.write_information(title, pageUrl, region, publishDate, announcementType)

        return True
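

# A minimal driver sketch. This block is not part of the original module; the
# connection parameters are placeholder assumptions and must be adapted to
# the real database before use.
if __name__ == '__main__':
    connect = pymysql.connect(host='127.0.0.1', user='root', password='change-me',
                              database='procurement', charset='utf8mb4')
    try:
        crawler = Crawler(connect)
        crawler.Crawl()   # one full crawl pass over all sources
        crawler.Check()   # email an alert if nothing fresh was stored
    finally:
        connect.close()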