commit
1fb57e6877
35 changed files with 4828 additions and 0 deletions
@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml
@ -0,0 +1,8 @@ |
|||||
|
<project version="4"> |
||||
|
<component name="Black"> |
||||
|
<option name="sdkName" value="Python 3.9 (PyGuoyan)" /> |
||||
|
</component> |
||||
|
<component name="ProjectRootManager"> |
||||
|
<output url="file://$PROJECT_DIR$/out" /> |
||||
|
</component> |
||||
|
</project> |
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/PyGuoyan.iml" filepath="$PROJECT_DIR$/PyGuoyan.iml" />
    </modules>
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="PersistentConfig">
    <option name="langCode" value="en" />
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.9 (PyGuoyan)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
@@ -0,0 +1,14 @@
#!/usr/bin/python3
from splash.gysplash import SBase
import json


class SYgcg(SBase):
    def open(self):
        return super().open('ygcg', pages=2, announcement_type='政府采购')


if __name__ == '__main__':
    test = SYgcg()
    r = test.open()

    results = json.loads(r.text)
    print(results)
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,21 @@
-----BEGIN CERTIFICATE-----
MIIDdTCCAl2gAwIBAgILBAAAAAABFUtaw5QwDQYJKoZIhvcNAQEFBQAwVzELMAkG
A1UEBhMCQkUxGTAXBgNVBAoTEEdsb2JhbFNpZ24gbnYtc2ExEDAOBgNVBAsTB1Jv
b3QgQ0ExGzAZBgNVBAMTEkdsb2JhbFNpZ24gUm9vdCBDQTAeFw05ODA5MDExMjAw
MDBaFw0yODAxMjgxMjAwMDBaMFcxCzAJBgNVBAYTAkJFMRkwFwYDVQQKExBHbG9i
YWxTaWduIG52LXNhMRAwDgYDVQQLEwdSb290IENBMRswGQYDVQQDExJHbG9iYWxT
aWduIFJvb3QgQ0EwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDaDuaZ
jc6j40+Kfvvxi4Mla+pIH/EqsLmVEQS98GPR4mdmzxzdzxtIK+6NiY6arymAZavp
xy0Sy6scTHAHoT0KMM0VjU/43dSMUBUc71DuxC73/OlS8pF94G3VNTCOXkNz8kHp
1Wrjsok6Vjk4bwY8iGlbKk3Fp1S4bInMm/k8yuX9ifUSPJJ4ltbcdG6TRGHRjcdG
snUOhugZitVtbNV4FpWi6cgKOOvyJBNPc1STE4U6G7weNLWLBYy5d4ux2x8gkasJ
U26Qzns3dLlwR5EiUWMWea6xrkEmCMgZK9FGqkjWZCrXgzT/LCrBbBlDSgeF59N8
9iFo7+ryUp9/k5DPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNVHRMBAf8E
BTADAQH/MB0GA1UdDgQWBBRge2YaRQ2XyolQL30EzTSo//z9SzANBgkqhkiG9w0B
AQUFAAOCAQEA1nPnfE920I2/7LqivjTFKDK1fPxsnCwrvQmeU79rXqoRSLblCKOz
yj1hTdNGCbM+w6DjY1Ub8rrvrTnhQ7k4o+YviiY776BQVvnGCv04zcQLcFGUl5gE
38NflNUVyRRBnMRddWQVDf9VMOyGj/8N7yy5Y0b2qvzfvGn9LhJIZJrglfCm7ymP
AbEVtQwdpf5pLGkkeB6zpxxxYu7KyJesF12KwvhHhm4qxFYxldBniYUr+WymXUad
DKqC5JlR3XC321Y9YeRq4VzW9v493kHMB65jUr9TU/Qr6cf9tveCX4XSQRjbgbME
HMUfpIBvFSDJ3gyICh3WZlXi/EjJKSZp4A==
-----END CERTIFICATE-----
@@ -0,0 +1,5 @@
[database]
host = localhost
database = guoyantest
user = root
password = Guoyan83086775
@@ -0,0 +1,714 @@
#!/usr/bin/python3
"""
===========================================================================================
This module crawls procurement information.
It processes procurement announcements, touching four tables:
sc_cggg, catalog, catalogdata and readlog.
===========================================================================================
class Crawler:
    def __init__(self, connect):
    def generate_id(self):
    def write_log_information(self, data_id, catalog_name):
    def CrawlPage_gzw_ningbo(self, page):                   # Ningbo SASAC municipal SOE tendering information
    def CrawlPage_zjcs_nbxzfw(self, page, typeParam):       # Ningbo intermediary supermarket
    def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):  # Ningbo sunshine procurement
    def CrawlPage_zfcg_czt_zj(self, page, typeParam):       # Zhejiang government procurement network
    def CrawlPage_cbbidding(self, page, typeParam):         # Ningbo Zhongji International Tendering Co., Ltd.
    def CrawlPage_zmeetb(self, page, typeParam):            # Zhejiang International Tendering Co., Ltd.
    def CrawlPage_nbbidding(self, page, typeParam):         # Ningbo International Tendering Co., Ltd.
    def CrawlPage_nbmcbidding(self, page, typeParam):       # Ningbo Mingcheng Tendering Agency Co., Ltd.
============================================================================================
"""

import datetime
import hashlib
import pymysql
import json
import random
from requests_html import HTMLSession
from requests_html import HTML, UserAgent
import gymailer
import time

'''
============================================================
This class wraps the Splash rendering service.
Here:
    self.splash_ip is the IP address of the Splash service.
============================================================
'''


class Splash:
    def __init__(self):
        self.splash_ip = '127.0.0.1'

    '''
    ============================================================
    The wait_for parameter names the element to wait for: the call only
    returns once that element has rendered, otherwise it waits up to 200
    seconds. wait_for takes a CSS selector, e.g. '#app' for an element id
    or '.class-name' for a class. (A usage sketch follows the method body.)
    ============================================================
    '''
    def post(self, url, wait_for, pages=1, page_element='', headers={'content-type':'application/json','Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'}):
        lua_scripts = """
        function wait_for_element(splash, css, maxwait)
            -- Wait until a selector matches an element
            -- in the page. Return an error if waited more
            -- than maxwait seconds.
            if maxwait == nil then
                maxwait = 10
            end
            return splash:wait_for_resume(string.format([[
                function main(splash) {
                    var selector = '%s';
                    var maxwait = %s;
                    var end = Date.now() + maxwait*1000;

                    function check() {
                        if(document.querySelector(selector)) {
                            splash.resume('Element found');
                        } else if(Date.now() >= end) {
                            var err = 'Timeout waiting for element';
                            splash.error(err + " " + selector);
                        } else {
                            setTimeout(check, 200);
                        }
                    }
                    check();
                }
            ]], css, maxwait))
        end

        function main(splash, args)
            pages = """ + str(pages) + """
            page_element = '""" + page_element + """'
            wait_for = '""" + wait_for + """'
            splash:go('""" + url + """')
            wait_for_element(splash, wait_for)
            wait_for_element(splash, page_element)

            -- Add the first page to the result set
            results = {splash:html()}

            if pages == 1 then
                return results
            else
                -- Turn the pages:
                -- locate the pager element on the page, then send it a click() event
                for i = 2, pages do
                    -- js holds the JavaScript that fetches the pager element and sends the click event
                    js = string.format("document.querySelector('%s').click();", page_element)

                    -- Run the paging script
                    splash:runjs(js)

                    -- Wait for the page to finish loading
                    wait_for_element(splash, wait_for)
                    wait_for_element(splash, page_element)

                    -- A delay seems to be required here, otherwise the page
                    -- may not have finished re-rendering yet
                    assert(splash:wait(5))

                    -- Add this page to the result set
                    table.insert(results, splash:html())
                end
                return results
            end
        end
        """

        splash_url = 'http://' + self.splash_ip + ':8050/execute'
        data = json.dumps({'lua_source':lua_scripts})
        r = HTMLSession().post(splash_url, headers=headers, data=data)
        return r
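
# Usage sketch (illustrative, not part of the original module): fetch two pages
# of a hypothetical listing through the local Splash service. The URL and
# selectors below are assumptions made for the sake of the example.
#
#   r = Splash().post('https://example.com/list', wait_for='.ant-list-items',
#                     pages=2, page_element='.anticon-right')
#   pages_html = json.loads(r.text)   # one rendered HTML document per page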


class Crawler:
    def __init__(self, connect):
        self.connect = connect

    def generate_id(self):
        # Generate a 32-character ID (md5 hex digest of the current time plus a random number)
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + str(random.randint(0, 1000000))
        md5_hash = hashlib.md5()
        md5_hash.update(current_time.encode('utf-8'))
        return md5_hash.hexdigest()

    def write_log_information(self, data_id, catalog_name, log_type='采购公告'):
        # A record has been added, so the related bookkeeping must be updated too:
        # the catalog-data entry and the read-log entry
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id from catalog where name = '%s'" % (log_type))
            if affected_row == 0:
                return False

            result = cursor.fetchall()
            catalog_id = result[0][0]
            catalogdata_id = self.generate_id()
            readlog_id = self.generate_id()

            affected_row = cursor.execute("SELECT staffid FROM userinfo where username = 'root'")
            if affected_row == 0:
                return False

            result = cursor.fetchall()
            staff_id = result[0][0]
            add_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            affected_row = cursor.execute(
                'insert into catalogdata (id, dataid, catalogid, creatorid, menderid, adddate, modifydate, datastatus) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                (catalogdata_id, data_id, catalog_id, staff_id, staff_id, add_date, add_date, 0))

            cursor.execute(
                'insert into readlog (id, dataid, staffid, readnum, adddate, LastAccessDate, resid) values (%s, %s, %s, %s, %s, %s, %s)',
                (readlog_id, data_id, staff_id, 1, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), catalog_id))

            return True

    def write_information(self, title, url, region, publishTime, announcementType):
        # Write one announcement record into the database
        with self.connect.cursor() as cursor:
            cggg_id = self.generate_id()

            try:
                title = title.replace("'", "\\\'")
                affected_rows = cursor.execute(
                    'insert into sc_cggg (id, bt, lj, ssqy, fbsj, gglb) values (%s, %s, %s, %s, %s, %s)',
                    (cggg_id, title, url, region, publishTime, announcementType))
            except pymysql.err.IntegrityError:
                print('信息重复')
                self.connect.rollback()
                return False
            else:
                if self.write_log_information(cggg_id, announcementType):
                    self.connect.commit()
                else:
                    print('添加采购信息失败')
                    self.connect.rollback()
                    return False

            return True

    def write_information_cgyx(self, cgyx):
        # Write one procurement-intention record into the database

        with self.connect.cursor() as cursor:
            cgyx_id = self.generate_id()
            cgyx['cgxmmc'] = cgyx['cgxmmc'].replace("'", "\\\'")
            strSql = 'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) values (\''+cgyx_id+'\',\''+cgyx['cgxmmc']+'\',\''+cgyx['lj']+'\',\''+cgyx['cgxqqk']+'\',\''+cgyx['ysje']+'\',\''+cgyx['yjcgsj']+'\',\''+cgyx['ly']+'\')'
            try:
                affected_rows = cursor.execute(strSql)
            except pymysql.err.IntegrityError:
                print('信息重复')
                #self.connect.rollback()
                return False
            else:
                if self.write_log_information(cgyx_id, '采购意向'):
                    self.connect.commit()
                else:
                    print('添加采购信息失败')
                    self.connect.rollback()
                    return False

            return True
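
    # Sketch (not original code): the hand-concatenated strSql above could use the
    # same parameterized style write_information() already uses, which also removes
    # the manual quote escaping:
    #
    #   cursor.execute(
    #       'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) '
    #       'values (%s, %s, %s, %s, %s, %s, %s)',
    #       (cgyx_id, cgyx['cgxmmc'], cgyx['lj'], cgyx['cgxqqk'],
    #        cgyx['ysje'], cgyx['yjcgsj'], cgyx['ly']))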

    def Check(self):
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id as total from sc_cggg where date(fbsj) > (NOW() - INTERVAL 1 DAY);")
            if affected_row == 0:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息', '采购信息提取不正常,请检查!')
                return False
            else:
                return True

    def Crawl(self):
        # This method is the top-level entry point that actually runs the crawl.

        # Crawl the Zhejiang government procurement network
        print('开始获取浙江政采网的信息\n')

        # Announcement-type descriptors passed to the page crawler
        infoType = [
            {"announcementCode": "110-175885", "announcementType":"采购意向"},
            {"announcementCode": "110-978863", "announcementType":"采购公告"},
            {"announcementCode": "110-943756", "announcementType":"更正公告"},
            {"announcementCode": "110-420383", "announcementType":"非政府采购公告"},
            {"announcementCode": "110-900461", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 11):
                try:
                    self.CrawlPage_zfcg_czt_zj(page, typeParam)
                except Exception as e:
                    print('3--------------------------------', e)

        # Crawl the Ningbo sunshine procurement network
        print('开始获取宁波市阳光采购网的信息\n')
        infoType = [
            {"announcementCode": "21", "announcementType":"采购公告"},
            {"announcementCode": "23", "announcementType":"更正公告"},
            {"announcementCode": "22", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            try:
                self.CrawlPage_ygcg_nbcqjy_org(2, typeParam)
            except Exception as e:
                print('4--------------------------------', e)

        # Crawl the Ningbo intermediary supermarket network
        print('开始获取宁波市中介超市网的信息\n')
        infoType = [
            {"announcementCode": '1', "announcementType":"项目需求公告"},
            {"announcementCode": '2', "announcementType":"结果公告"}
        ]

        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_zjcs_nbxzfw(page, typeParam)
                except Exception as e:
                    print('5------------------------------', e)

        # Crawl procurement information of Ningbo SASAC municipal SOEs
        print('开始获取宁波市国资委市属企业招投标网的信息\n')
        for page in range(1, 5):
            try:
                self.CrawlPage_gzw_ningbo(page)
            except Exception as e:
                print('6------------------------------', e)

        # Crawl the Ningbo Zhongji international tendering network
        print('开始获取宁波中基国际招标网的信息\n')
        infoType = [
            {"announcementCode": "22", "announcementType":"采购公告"},
            {"announcementCode": "23", "announcementType":"结果公告"}
        ]

        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_cbbidding(page, typeParam)
                except Exception as e:
                    print('7--------------------------------', e)

        # Crawl the Zhejiang international tendering network
        print('开始获取浙江国际招标网的信息\n')
        infoType = [
            {"announcementCode": "Zbgg", "announcementType":"采购公告"},
            {"announcementCode": "Gzgg", "announcementType":"更正公告"},
            {"announcementCode": "jggg", "announcementType":"结果公告"}
        ]

        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_zmeetb(page, typeParam)
                except Exception as e:
                    print('8----------------------------', e)

        # Crawl the website of Ningbo International Tendering Co., Ltd.
        print('开始获取宁波国际招标网的信息\n')

        # Announcement-type descriptors passed to the page crawler
        infoType = [
            {"announcementCode": "1", "announcementType":"采购公告"},
            {"announcementCode": "1", "announcementType":"结果公告"},
            {"announcementCode": "2", "announcementType":"采购公告"},
            {"announcementCode": "2", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_nbbidding(page, typeParam)
                except Exception as e:
                    print('9--------------------------------', e)

        # Crawl the website of Ningbo Mingcheng Tendering Agency Co., Ltd.
        print('开始获取宁波名城招标的信息\n')

        # Announcement-type descriptors passed to the page crawler
        infoType = [
            {"announcementCode": "99", "announcementType":"采购公告"},
            {"announcementCode": "88", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 2):
                try:
                    self.CrawlPage_nbmcbidding(page, typeParam)
                except Exception as e:
                    print('10--------------------------------', e)

    # Ningbo Zhongji International Tendering Co., Ltd. https://www.cbbidding.com/
    def CrawlPage_cbbidding(self, page, typeParam):
        # Crawl one specified listing page.
        session = HTMLSession()
        session.DEFAULT_RETRIES = 5
        url = 'https://www.cbbidding.com/Index/cms.html?mid=' + typeParam['announcementCode'] + '&%2FIndex%2Fcms%2Fmid%2F' + typeParam['announcementCode'] + '_html=&page=' + str(page)

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the page has to be parsed
        r = session.get(url = url, headers = headers)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中基国际招标网', r.text)
            return False

        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[3]/div[3]/div[2]/div[2]/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = 'https://www.cbbidding.com' + item.xpath('//a')[0].attrs.get('href')
            region = '中基招标'
            publishDate = item.xpath('//div')[0].text

            try:
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
            except Exception as e:
                # Normalize dates like '2023.11.03' to '2023-11-03' before parsing
                publishDate = publishDate.replace('.', '-')
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))

            print(url, title)
            announcementType = typeParam['announcementType']
            #print(title, url, region, publishDate, announcementType)
            self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang International Tendering Co., Ltd. https://www.zmeetb.com/
    def CrawlPage_zmeetb(self, page, typeParam):
        # Crawl one specified listing page.
        session = HTMLSession()
        url = 'https://www.zmeetb.com/' + typeParam['announcementCode'] + '/index/p/' + str(page) + '.html'

        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Cache-Control": "max-age=0",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
            "DNT": '1',
            "Host": "www.zmeetb.com",
            "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "Windows",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the page has to be parsed.
        # Calling render() on this site runs into an SSL-certificate problem;
        # the Chromium certificate handling still needs further investigation.
        #r = session.get(url = url, headers = headers, verify='/opt/PyGuoyan/www.zmeetb.com')
        r = session.get(url = url, headers = headers, verify=False)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:浙江国际招标网', r.text)
            return False

        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[1]/div[3]/div[2]/div/div/div[3]/div/ul/li/a')
        for item in data:
            title = item.xpath('//p')[0].text
            url = item.attrs.get('href')
            region = '浙江国际招标'
            publishDate = item.xpath('//p')[1].text
            announcementType = typeParam['announcementType']

            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo Mingcheng Tendering Agency Co., Ltd. http://www.nbmcbidding.com/
    def CrawlPage_nbmcbidding(self, page, typeParam):
        # Crawl one specified listing page.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbmcbidding.com/news/99/" + str(page) + "/"
        else:
            url = "http://www.nbmcbidding.com/news/88/" + str(page) + "/"

        data = {}
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Host": "www.nbmcbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        r = session.get(url = url, headers = headers, json = data)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波名诚招标代理有限公司', r.text)
            return False

        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[1]/div/div[3]/div[2]/ul/li')
        for item in data:
            title = item.xpath('//a/div[2]')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波名诚招标'
            publishDate = item.xpath('//a/div[4]')[0].text
            announcementType = typeParam['announcementType']

            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo International Tendering Co., Ltd. http://www.nbbidding.com/
    def CrawlPage_nbbidding(self, page, typeParam):
        # Crawl one specified listing page.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbbidding.com/Home/Notice/news_list?page=" + str(page) + "&is_Open=1&keyword"
        else:
            url = "http://www.nbbidding.com/Home/Publicity/news_list?page=" + str(page) + "&is_Open=1&keyword"

        data = {}
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Host": "www.nbbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        r = session.get(url = url, headers = headers, json = data)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国际招标网', r.text)
            return False

        data = json.loads(r.text)['data']
        total = data['page']['count']
        data = data['list']

        for item in data:
            id = item['id']
            if typeParam['announcementType'] == '采购公告':
                url = 'http://www.nbbidding.com/Home/Notice/news_detail?id=%s' % (id)
            else:
                url = 'http://www.nbbidding.com/Home/Publicity/news_detail?id=%s' % (id)
            title = item['title']
            region = '宁波国际招标'
            publishDate = item['addtime']
            announcementType = item['stage']
            self.write_information(title, url, region, publishDate, announcementType)

            print(publishDate, title, url)
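
    # Expected response shape, reconstructed from the field accesses above (illustrative):
    #   {"data": {"page": {"count": ...},
    #             "list": [{"id": ..., "title": ..., "addtime": ..., "stage": ...}, ...]}}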

    # Ningbo SASAC municipal SOE tendering information network
    def CrawlPage_gzw_ningbo(self, page):
        # Crawl one specified listing page.
        session = HTMLSession()
        url = 'http://gzw.ningbo.gov.cn/col/col1229663137/index.html?uid=6085425&pageNum=%s' % str(page)

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the page has to be parsed
        r = session.get(url = url, headers = headers)
        r.html.render()

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国资委市属企业招标信息网', r.text)
            return False

        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[2]/div[3]/div/div/div[2]/div/div/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波市属国企'
            publishDate = item.xpath('//p')[0].text
            announcementType = '采购公告'
            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo intermediary supermarket network
    def CrawlPage_zjcs_nbxzfw(self, page, typeParam):
        # Crawl one specified listing page.
        # typeParam identifies the type of procurement information.
        session = HTMLSession()
        urllist = ['http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0901&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15','http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0902&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15']

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "Host": "ygcg.nbcqjy.org",
            "Referer": "http://zjcs.nbxzfw.gov.cn/newsweb/page/news/infolist.html?Type="+typeParam['announcementCode'],
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        for url in urllist:
            r = session.get(url = url, headers = headers)

            if r.status_code != 200:
                if page == 1:
                    gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text)
                return False

            data = json.loads(r.text)['data']

            total = data['total']
            data = data['rows']

            for item in data:
                articleId = item['AutoId']
                BulletinTypeId = item['BulletinTypeId']
                url = 'http://zjcs.nbxzfw.gov.cn/YWGG/Info?id=%s&Type=%s' % (articleId, BulletinTypeId)
                title = item['BulletinTitle']
                region = '宁波中介超市'
                publishDate = item['PublishDate']
                announcementType = typeParam['announcementType']
                self.write_information(title, url, region, publishDate, announcementType)

                #print(publishDate, url)

    # Ningbo sunshine procurement network
    def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
        # Crawl the requested number of listing pages through Splash.
        url = 'https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType=' + typeParam['announcementCode']

        wait_for = '.ant-pagination-item-ellipsis'
        page_element = '.anticon-right'
        try:
            r = Splash().post(url, wait_for, pages=pages, page_element=page_element)
        except Exception as e:
            print(e)
            return False

        results = json.loads(r.text)

        if r.status_code != 200:
            gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, 错误代码:'+str(r.status_code), r.text)
            return False

        for i in range(1, pages + 1):
            data = HTML(html=results[str(i)]).xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[5]/div[1]/div/ul/li')
            if len(data) == 0:
                print('数据为空')
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, keyerror', '第%s页数据为空' % str(i))
                return False

            for item in data:
                url = 'http://ygcg.nbcqjy.org' + item.xpath('//a')[0].attrs.get('href')
                title = item.xpath('//a/span[3]')[0].text
                region = '宁波阳光采购'
                publishDate = item.xpath('//div[2]')[0].text
                announcementType = typeParam['announcementType']
                print(title)
                self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang government procurement network
    def CrawlPage_zfcg_czt_zj(self, page, typeParam):
        # Crawl one specified listing page.
        session = HTMLSession()
        url = 'https://zfcg.czt.zj.gov.cn/portal/category'
        if typeParam['announcementCode'] == '110-420383':
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "districtCode": ["339900"],
                "isProvince": True,
                "includeGovDistrict": "1",
                "_t": 1699104836000
            }
        else:
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "isGov": True,
                "excludeDistrictPrefix": "90",
                "_t": 1699104836000
            }

        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "content-type": "application/json;charset=UTF-8",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "x-requested-with": "XMLHttpRequest"
        }

        try:
            r = session.post(url = url, headers = headers, json = data)
        except Exception as e:
            print('10-------------------------', e)
            return False

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波政府采购网', r.text)
            return False

        data = json.loads(r.text)['result']['data']
        total = data['total']
        data = data['data']

        for item in data:
            publishDate = datetime.datetime.fromtimestamp(item['publishDate']/1000)
            pageUrl = 'https://zfcg.czt.zj.gov.cn/luban/detail?parentId=600007&articleId=' + item['articleId'] + '&utm=luban.luban-PC-37000.979-pc-websitegroup-zhejiang-secondPage-front.21.320086307d6811ee86314be74945ec2c'
            detailUrl = 'https://zfcg.czt.zj.gov.cn/portal/detail?articleId=' + item['articleId']
            announcementType = typeParam['announcementType']
            if announcementType == '采购意向':
                r = session.get(url = detailUrl, headers = headers)

                detailData = json.loads(r.text)['result']['data']
                if detailData is None:
                    break

                content = HTML(html='<xml>'+detailData['content']+'</xml>')
                region = item['districtName']
                for detailItem in content.xpath('xml/div/div/div[1]/div/table/tbody/tr'):
                    title = detailItem.xpath('//td[2]')[0].text
                    cgxqqk = detailItem.xpath('//td[3]')[0].text
                    ysje = detailItem.xpath('//td[4]')[0].text
                    yjcgsj = detailItem.xpath('//td[5]')[0].text
                    ly = detailData["title"]

                    self.write_information(title, pageUrl, region, publishDate, announcementType)
                    self.write_information_cgyx({'cgxmmc':title,'lj':pageUrl, 'cgxqqk':cgxqqk, 'ysje':ysje, 'yjcgsj':yjcgsj, 'ly':ly})
            else:
                title = item['title']
                region = item['districtName']
                self.write_information(title, pageUrl, region, publishDate, announcementType)

            #print(publishDate, url)

        return True
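
# Usage sketch (illustrative; main.py drives the class the same way):
#
#   connect = pymysql.connect(host='localhost', user='root',
#                             password='...', database='guoyan')
#   crawler = Crawler(connect)
#   crawler.Crawl()   # crawl every configured site once
#   crawler.Check()   # mail a warning if no rows arrived in the last day
#   connect.close()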
@@ -0,0 +1,137 @@
#!/usr/bin/python3

import pymysql
from properties import Properties
import sys, getopt


class DbSearch:
    # This class provides various database search services
    def __init__(self, connect):
        self.connect = connect

    def GetTableList(self, database):
        # List the tables of the given database
        cursorTable = self.connect.cursor()
        cursorTable.execute("SELECT table_name FROM INFORMATION_SCHEMA.TABLES where table_schema = '" + database + "'")

        return cursorTable.fetchall()

    def GetColumnList(self, tableName):
        # List the columns of the given table
        # (note: `database` here is the module-level global set in __main__)
        cursorColumn = self.connect.cursor()
        cursorColumn.execute("SELECT column_name,data_type FROM INFORMATION_SCHEMA.COLUMNS where table_schema='" + database + "' AND table_name='" +
            tableName + "'")
        return cursorColumn.fetchall()

    def SearchTableByColumnName(self, columnName, database):
        # Find the tables that have a column whose name contains columnName
        tableList = self.GetTableList(database)
        findList = list()
        for table in tableList:
            columnList = self.GetColumnList(table[0])
            for column in columnList:
                if column[0].find(columnName) != -1:
                    findList.append(table[0])

        return findList

    def SearchTableByText(self, searchText, database):
        # Find the tables that contain the string searchText and print the matching rows
        tableList = self.GetTableList(database)
        if len(tableList) == 0:
            return False

        found = 0
        findList = list()
        for table in tableList:
            strSql = "SELECT '" + table[0] + "' as table_name, t.* "
            strSql = strSql + " FROM " + database + "." + table[0] + " as t where " + "("

            columnList = self.GetColumnList(table[0])
            i = 0

            count = len(columnList)

            for column in columnList:
                # Skip columns whose data type is not textual
                if not column[1] in ('varchar', 'char', 'text'):
                    continue
                i += 1

                if i > 1:
                    strSql += " or "
                strSql += column[0] + " like '%" + searchText + "%' "

            strSql += ")"

            cursorColumn = self.connect.cursor()
            try:
                cursorColumn.execute(strSql)
            except Exception as e:
                print('2----------------------------', database, strSql)
                print("-----错误信息:-----\n", e)
                return False

            result = cursorColumn.fetchall()
            if len(result) > 0:
                findList.append(table[0])
                print("==========================================================================")
                print(table[0], result, strSql)
        return findList
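
    # Sketch (not original code) of the statement SearchTableByText() assembles
    # for a table sc_cggg with text columns bt and lj, searching for '空调':
    #
    #   SELECT 'sc_cggg' as table_name, t.*  FROM guoyan.sc_cggg as t where (bt like '%空调%'  or lj like '%空调%' )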


if __name__ == '__main__':
    print(
        """
        ============================================================
        |这是数据库全文检索工具,包含两个参数                      |
        ============================================================
        """)

    # Configure the runtime environment: set is_test to True on a test environment
    is_test = False

    if is_test:
        file_path = "/opt/eresource_test/webapp/WEB-INF/classes/prod/jdbc.properties"
        database = 'guoyantest'
    else:
        file_path = "/opt/eresource/webapp/WEB-INF/classes/prod/jdbc.properties"
        database = 'guoyan'

    # Read the database configuration from the jdbc.properties file
    props = Properties(file_path)
    host = 'localhost'
    user = props.get('jdbc.username')
    password = props.get('jdbc.password')

    # Open the database connection
    db = pymysql.connect(host = host, user = user, password = password, database = database)

    # Read the command-line arguments
    keyword = ''
    searchType = ''
    try:
        opts, args = getopt.getopt(sys.argv[1:],"hT:k:",["keyword=","searchType="])
    except getopt.GetoptError:
        print(sys.argv[0] + ' -k <keyword> -T <searchType>')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('3--------------------', 'test.py -k <keyword> -T <searchType>')
            sys.exit()
        elif opt in ("-k", "--keyword"):
            keyword = arg
        elif opt in ("-T", "--searchType"):
            searchType = arg
    dbSearch = DbSearch(db)
    if searchType == '0':
        print('正在根据您输入的关键词查找表.....................')
        print('found tables: ', dbSearch.SearchTableByText(keyword, database))
    elif searchType == '1':
        print('正在根据您输入的列名查找表.....................')
        print('found tables: ', dbSearch.SearchTableByColumnName(keyword, database))
@@ -0,0 +1,59 @@
#!/usr/bin/python3
"""Main program of the crawler

Author: 陈进钱
Date: 2023/11/03
"""

import pymysql
import datetime
import time
from apscheduler.schedulers.blocking import BlockingScheduler
from properties import Properties
from crawler import Crawler

print(
    """采购信息采集器 v1.0
    ===================================================================================
    这个程序用于获取各大招投标网站的采购信息
    version: 1.0
    作者:陈进钱
    日期:2023-11-04
    ===================================================================================""")

# Configure the runtime environment: set is_test to True on a test environment
is_test = False

if is_test:
    file_path = "/opt/eresource_test/webapp/WEB-INF/classes/prod/jdbc.properties"
    database = 'guoyantest'
else:
    file_path = "/opt/eresource/webapp/WEB-INF/classes/prod/jdbc.properties"
    database = 'guoyan'

# Read the database configuration from the jdbc.properties file
props = Properties(file_path)
host = 'localhost'
user = props.get('jdbc.username')
password = props.get('jdbc.password')

# Open the database connection
connect = pymysql.connect(host = host, user = user, password = password, database = database)

# Fetch the procurement information and write it into the database
crawler = Crawler(connect)

# Scheduled health-check task
def crawl_check_func():
    crawler.Check()

# Scheduled crawl task
def crawl_job_func():
    crawler.Crawl()

sched = BlockingScheduler()
sched.add_job(crawl_job_func, 'interval', hours=3, jitter=120, max_instances=4)
sched.add_job(crawl_check_func, 'interval', days=1, jitter=120, max_instances=4)
sched.start()

# Close the database connection (reached only after the blocking scheduler shuts down)
connect.close()
@@ -0,0 +1,44 @@
import smtplib
from email.mime.text import MIMEText
from email.header import Header

def SendMail(sender, receiver, subject, message):
    # Outgoing mail server
    smtp_server = 'smtp.126.com'

    # Outgoing mail server port
    smtp_port = 465

    # Mail object
    msg = MIMEText(message, 'plain', 'utf-8')
    msg['From'] = Header(sender, 'utf-8')
    msg['To'] = Header(receiver, 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')

    # SMTP object
    smtpObj = smtplib.SMTP_SSL(smtp_server, smtp_port)

    # Log in to the SMTP server
    smtpObj.login(sender, 'ERXYFJRLKPTTDXWH')

    # Send the mail
    smtpObj.sendmail(from_addr=sender,to_addrs=[receiver],msg=msg.as_string())

    # Close the SMTP connection
    smtpObj.quit()

if __name__ == '__main__':

    # Sender address
    sender = 'jinqian_chen@126.com'

    # Receiver address
    receiver = 'jinqian.chen@srit.com.cn'

    # Mail subject
    subject = 'Python3发送邮件示例, new'

    # Mail body
    message = '这是一封Python3发送的邮件'
    SendMail(sender, receiver, subject, message)
@@ -0,0 +1,46 @@
#Db2
#hibernate.dialect=org.hibernate.dialect.DB2Dialect
#jdbc.driverClassName=com.ibm.db2.jcc.DB2Driver
#jdbc.url=jdbc:db2://localhost:50000/eaching

#Oracle
#hibernate.dialect=org.hibernate.dialect.Oracle10gDialect
#jdbc.driverClassName=oracle.jdbc.driver.OracleDriver
#jdbc.url=jdbc:oracle:thin:@47.99.208.214:1521:orcl
#jdbc.url=jdbc:oracle:thin:@118.190.161.36:1521:orcl

#SqlServer
#hibernate.dialect=org.hibernate.dialect.SQLServerDialect
#jdbc.driverClassName=net.sourceforge.jtds.jdbc.Driver
#jdbc.url=jdbc:jtds:sqlserver://localhost:1433/guanwaimatou;SelectMethod=Cursor


#MySql
hibernate.dialect=org.hibernate.dialect.MySQLDialect
jdbc.driverClassName=com.mysql.jdbc.Driver
jdbc.url=jdbc:mysql://116.62.210.190:3306/guoyantest?autoReconnect=true&useUnicode=true&characterEncoding=UTF8&mysqlEncoding=utf8&zeroDateTimeBehavior=convertToNull

jdbc.username=root
jdbc.password=Guoyan83086775

jdbc.maxConn=20
jdbc.minConn=5
jdbc.activeTime=900000
jdbc.alias=eaching
jdbc.keepingSleepTime=30000
jdbc.maxConnectionLifetime=60000

jdbc.multiSource=false

hibernate.cache.use_second_level_cache=true
hibernate.show_sql=false
hibernate.generate_statistics=false
hibernate.cache.provider_class=org.hibernate.cache.EhCacheProvider
#hibernate.cache.provider_class=net.oschina.j2cache.hibernate3.J2CacheProvider
hibernate.cache.use_minimal_puts=true
hibernate.cache.use_structured_entries=true
hibernate.cache.use_query_cache=true
hibernate.use_sql_comments=true
hibernate.order_updates=true
hibernate.format_sql=false
hbm2ddl.auto=create
@@ -0,0 +1,75 @@
#!/usr/bin/python3
"""Main program of the crawler

Author: 陈进钱
Date: 2023/11/03
"""

import pymysql
import datetime
import time
from apscheduler.schedulers.blocking import BlockingScheduler
from properties import Properties
from crawler import Crawler
import sys
import os

print(
    """采购信息采集器 v1.0
    ===================================================================================
    这个程序用于获取各大招投标网站的采购信息
    version: 1.0
    作者:陈进钱
    日期:2023-11-04
    ===================================================================================""")

# Configure the runtime environment: set is_test to True on a test environment
is_test = True
if is_test:
    root = "/opt/eresource_test/webapp/WEB-INF/classes/prod/"
else:
    root = "/opt/eresource/webapp/WEB-INF/classes/prod/"

if os.path.exists(root):
    file_path = root + "jdbc.properties"
else:
    file_path = "jdbc.properties"

if sys.platform == 'win32':
    host = '116.62.210.190'
    user = 'root'
    password = 'Guoyan83086775'
    if is_test:
        database = 'guoyantest'
    else:
        database = 'guoyan'
else:
    if is_test:
        database = 'guoyantest'
    else:
        database = 'guoyan'

# Read the database configuration from the jdbc.properties file
# (this overrides the credentials set in the win32 branch above)
props = Properties(file_path)
host = '116.62.210.190'
user = props.get('jdbc.username')
password = props.get('jdbc.password')

# Open the database connection
connect = pymysql.connect(host = host, user = user, password = password, database = database)

# Fetch the procurement information and write it into the database
crawler = Crawler(connect)
crawler.Crawl()
#crawler.CrawlPage_ygcg_nbcqjy_org(1, {"announcementCode": "21", "announcementType":"采购公告"})
#print(crawler.Check())

# Start the scheduled crawl task
#def crawl_job_func():
#    crawler.Crawl()

#sched = BlockingScheduler()
#sched.add_job(crawl_job_func, 'interval', hours=1, jitter=120)
#sched.start()

# Close the database connection
connect.close()
Binary file not shown.
@@ -0,0 +1,72 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import re
import os
import tempfile


class Properties:

    def __init__(self, file_name):
        # Fall back to the local file if the configured file does not exist
        if not os.path.exists(file_name):
            file_name = 'jdbc.properties'

        self.file_name = file_name
        self.properties = {}
        try:
            fopen = open(self.file_name, 'r')
            for line in fopen:
                line = line.strip()
                if line.find('=') > 0 and not line.startswith('#'):
                    strs = line.split('=')
                    self.properties[strs[0].strip()] = strs[1].strip()
        except Exception as e:
            raise e
        else:
            fopen.close()

    def has_key(self, key):
        return key in self.properties

    def get(self, key, default_value=''):
        if key in self.properties:
            return self.properties[key]
        return default_value

    def put(self, key, value):
        self.properties[key] = value
        replace_property(self.file_name, key + '=.*', key + '=' + value, True)


def replace_property(file_name, from_regex, to_str, append_on_not_exists=True):
    tmpfile = tempfile.TemporaryFile()

    if os.path.exists(file_name):
        r_open = open(file_name, 'r')
        pattern = re.compile(r'' + from_regex)
        found = None
        for line in r_open:
            if pattern.search(line) and not line.strip().startswith('#'):
                found = True
                line = re.sub(from_regex, to_str, line)
            tmpfile.write(line.encode())
        if not found and append_on_not_exists:
            tmpfile.write(('\n' + to_str).encode())
        r_open.close()
        tmpfile.seek(0)

        content = tmpfile.read()

        if os.path.exists(file_name):
            os.remove(file_name)

        w_open = open(file_name, 'wb')
        w_open.write(content)
        w_open.close()

        tmpfile.close()
    else:
        print("file %s not found" % file_name)
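
# Usage sketch (illustrative, not part of the original module):
#
#   props = Properties('jdbc.properties')
#   user = props.get('jdbc.username', 'root')   # default returned if the key is absent
#   props.put('jdbc.maxConn', '30')             # also rewrites the file on disk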
@@ -0,0 +1,14 @@
#!/usr/bin/python3
from splash.gysplash import SBase
import json


class SYgcg(SBase):
    def open(self):
        return super().open('ygcg', pages=2, announcement_type='政府采购')


if __name__ == '__main__':
    test = SYgcg()
    r = test.open()

    results = json.loads(r.text)
    print(results)
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,7 @@
#This is splash config file.
[splash service settings]
server = localhost
port = 8050



@@ -0,0 +1,20 @@
{
    "description": "This is splash config file.",
    "server": "127.0.0.1",
    "port": "8050",
    "class":{
        "SYgcg":{
            "url":"https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType={{ $noticeType }}",
            "_comment":"http://www.baidu.com",
            "param":{
                "noticeType":"21"
            },
            "wait_for":".ant-list-items",
            "page_element":".anticon-right",
            "headers":{
                "content-type":"application/json",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0"
            }
        }
    }
}
@@ -0,0 +1 @@
This is lua script file
@@ -0,0 +1,122 @@
#!/usr/bin/python3
'''===================================================================
This module wraps the Splash service for convenient use from Python.
Version: 1.0
Author: 陈进钱
Date: 2023-12-18
==================================================================='''
import os
import re
import json
import codecs
import configparser
from requests_html import HTMLSession
from requests_html import HTML

# Legacy ConfigParser instance; the class below actually reads the JSON config
config = configparser.ConfigParser()

# Splash base class
class SBase:
    def __init__(self):
        self.__lua_script = ''
        self.config = {}

        # Directory this file lives in
        self.root = os.path.dirname(os.path.abspath(__file__))

        # Create the config file automatically if missing
        dir = self.root + '/config'
        if not os.path.exists(dir):
            os.makedirs(dir)

        file_path = self.root + '/config/splash.json'

        if os.path.exists(file_path):
            file = codecs.open(file_path, 'r', 'utf-8')
            content = file.read()
            self.config = json.loads(content)
            file.close()
        else:
            self.config['description'] = 'This is splash config file.'
            self.config['server'] = 'localhost'
            self.config['port'] = '8050'

            content = json.dumps(self.config)
            with codecs.open(file_path, 'w', 'utf-8') as file:
                file.write(content)

        # Create an empty script file automatically if missing
        dir = self.root + '/scripts'
        if not os.path.exists(dir):
            os.makedirs(dir)

        # TODO: generalize this into reusable code
        file_path = dir + '/main.lua'

        if os.path.exists(file_path):
            file = codecs.open(file_path, 'r', 'utf-8')
            self.__lua_script = file.read()
            file.close()
        else:
            with codecs.open(file_path, 'w', 'utf-8') as file:
                self.__lua_script = 'This is lua script file'
                file.write(self.__lua_script)

    def script(self):
        return self.__lua_script

    def class_name(self):
        return type(self).__name__

    def replace(self, source, param, value):
        return re.sub(r'{{[\s]*\$' + param + r'[\s]*}}', value, source)
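
    # Illustrative: with the url template from config/splash.json,
    #   self.replace('...&noticeType={{ $noticeType }}', 'noticeType', '21')
    # yields '...&noticeType=21'.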

    # Substitute parameter variables into the Lua script
    def set_params_for_lua(self, scripts, params):
        for param in params:
            scripts = self.replace(scripts, param, params[param])
        return scripts

    '''
    --------------------------------------------------------------------------------------
    This function opens the given URL. The URL, parameters, the page element to
    wait for, the pager element to wait for, the headers and so on default to
    empty; whenever one of them is empty, it is taken from the entry for this
    class name in the configuration file.
    The function runs the main Lua script, which in turn executes the
    page-parsing Lua script whose name is passed in via scripts_js.
    --------------------------------------------------------------------------------------
    '''
    def open(self, scripts_js, pages=1, url='', params=None,
             wait_for='', page_element='', headers='', announcement_type=''):
        if url == '':
            url = self.config['class'][self.class_name()]['url']
        if params == None:
            params = self.config['class'][self.class_name()]['param']
        if len(params) > 0:
            for param in params:
                url = self.replace(url, param, params[param])

        if wait_for == '':
            wait_for = self.config['class'][self.class_name()]['wait_for']

        if page_element == '':
            page_element = self.config['class'][self.class_name()]['page_element']

        if headers == '':
            headers = self.config['class'][self.class_name()]['headers']
        scripts = self.script()
        scripts = self.set_params_for_lua(scripts, {
            'pages':str(pages),
            'url':url,
            'wait_for':wait_for,
            'page_element':page_element,
            # The parser script name is passed through as a parameter
            'scripts_js': scripts_js,
            'announcement_type':announcement_type
        })

        # print(scripts)
        data = json.dumps({'lua_source':scripts})
        splash_url = 'http://' + self.config['server'] + ':' + self.config['port'] + '/execute'
        r = HTMLSession().post(splash_url, headers=headers, data=data)

        return r
@@ -0,0 +1,75 @@
-- Main entry point for page scraping.
-- The module-load approach is required here; otherwise the js file apparently cannot be loaded dynamically
parser = require('parser')

function main(splash, args)
    pages = {{$pages}}
    scripts_js = '{{$scripts_js}}'
    page_element = '{{$page_element}}'
    wait_for = '{{$wait_for}}'
    announcement_type = '{{$announcement_type}}'
    splash:go('{{$url}}')
    wait_for_element(splash, wait_for)
    wait_for_element(splash, page_element)

    -- Set up the JavaScript parameters
    results = {}
    params_js = {}
    params_js['announcement_type'] = announcement_type

    -- Add the first page to the result set
    result = parser.select(splash, scripts_js, params_js)
    table.insert(results, result)

    if pages == 1 then
        return results
    else
        -- Turn the pages:
        -- locate the pager element on the page, then send it a click() event
        for i = 2, pages do
            -- Run the paging script
            -- js holds the JavaScript that fetches the pager element and sends the click event
            js = string.format("document.querySelector('%s').click();", page_element)
            splash:runjs(js)

            -- Wait for the page to finish loading
            wait_for_element(splash, wait_for)
            wait_for_element(splash, page_element)

            -- A delay seems to be required here, otherwise the page
            -- may not have finished re-rendering yet
            assert(splash:wait(5))
            result = parser.select(splash, scripts_js, params_js)
            table.insert(results, result)
        end
        return results
    end
end

function wait_for_element(splash, css, maxwait)
    -- Wait until a selector matches an element
    -- in the page. Return an error if waited more
    -- than maxwait seconds.
    if maxwait == nil then
        maxwait = 10
    end
    return splash:wait_for_resume(string.format([[
        function main(splash) {
            var selector = '%s';
            var maxwait = %s;
            var end = Date.now() + maxwait*1000;

            function check() {
                if(document.querySelector(selector)) {
                    splash.resume('Element found');
                } else if(Date.now() >= end) {
                    var err = 'Timeout waiting for element';
                    splash.error(err + " " + selector);
                } else {
                    setTimeout(check, 200);
                }
            }
            check();
        }
    ]], css, maxwait))
end
File diff suppressed because one or more lines are too long
@@ -0,0 +1,28 @@
-- This file defines the page-parsing module (loaded via require as `parser`)
parser = {}

function set_params(scripts, params_js)
    for param, value in pairs(params_js) do
        scripts = scripts:gsub("{{(%s*)$" .. param .. "(%s*)}}", value)
    end
    --scripts = scripts.gsub('123456 aaaa 123456', "[\s\\\]*aaaa\\\[\\\\s\\\]*", 'bbbb')
    return scripts
end

-- Run the named parsing script against the current page
function parser.select(splash, scripts_js, params_js)
    local file = io.open("/etc/splash/lua_modules/jquery-3.7.1.min.js", "r")
    splash:runjs(file:read('*a'))
    file:close()

    file = assert(io.open("/etc/splash/lua_modules/"..scripts_js..".js", "r"))
    scripts = file:read('*a')
    scripts = set_params(scripts, params_js)
    local js = splash:jsfunc(scripts)
    file:close()

    return js()
end

return parser
@@ -0,0 +1,32 @@
function () {
    title = '';
    url = '';
    updateTime = '';
    region = '';
    announcementType = '';
    results = {};
    lists = new Array();

    // Grab the head of the list
    ul = $('#app > div > div.z_list_vue > div.ant-spin-nested-loading > div > div > div.z_content > div.z_detail_content > div:nth-child(5) > div.ant-spin-nested-loading > div > ul');
    // Fetch the first list element; on success the wrapped object has length == 1
    li = ul.children('li').first()
    while (li.length == 1)
    {
        item = {}   // a fresh object per row, so entries do not share state
        a = li.find('div.ant-list-item-meta > div > h4 > span > a');
        item.title = $(a.children()['2']).attr('title');
        item.url = a.attr('href');
        item.updateTime = $(li.children()[1]).text();
        item.region = '宁波阳光采购';
        item.announcementType = '{{$announcement_type}}'

        lists.push(item)
        // Move to the next list element
        li = li.next()
    }

    results.count = lists.length
    results.lists = lists
    return results
}
File diff suppressed because it is too large
File diff suppressed because it is too large