Commit 1fb57e6877 ("first commit") by chen jinqian, 6 months ago, on branch master.

Files in this commit:
  1. .idea/.gitignore (3 lines)
  2. .idea/misc.xml (8 lines)
  3. .idea/modules.xml (8 lines)
  4. .idea/redis-manager-config.xml (6 lines)
  5. .idea/vcs.xml (6 lines)
  6. PyGuoyan.iml (9 lines)
  7. SYgcg.py (14 lines)
  8. __pycache__/crawler.cpython-310.pyc (binary)
  9. __pycache__/gymailer.cpython-310.pyc (binary)
  10. __pycache__/properties.cpython-310.pyc (binary)
  11. cert.crt (21 lines)
  12. config.ini (5 lines)
  13. crawler.py (714 lines)
  14. dbsearch.py (137 lines)
  15. gycrawler.py (59 lines)
  16. gymailer.py (44 lines)
  17. jdbc.properties (46 lines)
  18. main.py (75 lines)
  19. myrec.db (binary)
  20. properties.py (72 lines)
  21. splash/SYgcg.py (14 lines)
  22. splash/__init__.py (0 lines)
  23. splash/__pycache__/SYgcg.cpython-310.pyc (binary)
  24. splash/__pycache__/__init__.cpython-310.pyc (binary)
  25. splash/__pycache__/gysplash.cpython-310.pyc (binary)
  26. splash/config/splash.cnf (7 lines)
  27. splash/config/splash.json (20 lines)
  28. splash/configscripts/main.lua (1 line)
  29. splash/gysplash.py (122 lines)
  30. splash/scripts/main.lua (75 lines)
  31. splash/scripts/modules/jquery-3.7.1.min.js (2 lines)
  32. splash/scripts/modules/parser.lua (28 lines)
  33. splash/scripts/modules/ygcg.js (32 lines)
  34. splash/scripts/modules/zepto.js (1650 lines)
  35. splash/scripts/modules/zepto.js.1 (1650 lines)

3
.idea/.gitignore

@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml

8
.idea/misc.xml

@@ -0,0 +1,8 @@
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.9 (PyGuoyan)" />
  </component>
  <component name="ProjectRootManager">
    <output url="file://$PROJECT_DIR$/out" />
  </component>
</project>

8
.idea/modules.xml

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/PyGuoyan.iml" filepath="$PROJECT_DIR$/PyGuoyan.iml" />
    </modules>
  </component>
</project>

6
.idea/redis-manager-config.xml

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="PersistentConfig">
    <option name="langCode" value="en" />
  </component>
</project>

6
.idea/vcs.xml

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

9
PyGuoyan.iml

@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.9 (PyGuoyan)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

14
SYgcg.py

@@ -0,0 +1,14 @@
#!/usr/bin/python3
from splash.gysplash import SBase
import json

class SYgcg(SBase):
    def open(self):
        return super().open('ygcg', pages=2, annoucement_type='政府采购')

if __name__ == '__main__':
    test = SYgcg()
    r = test.open()
    results = json.loads(r.text)
    print(results)

BIN
__pycache__/crawler.cpython-310.pyc

Binary file not shown.

BIN
__pycache__/gymailer.cpython-310.pyc

Binary file not shown.

BIN
__pycache__/properties.cpython-310.pyc

Binary file not shown.

21
cert.crt

@@ -0,0 +1,21 @@
-----BEGIN CERTIFICATE-----
MIIDdTCCAl2gAwIBAgILBAAAAAABFUtaw5QwDQYJKoZIhvcNAQEFBQAwVzELMAkG
A1UEBhMCQkUxGTAXBgNVBAoTEEdsb2JhbFNpZ24gbnYtc2ExEDAOBgNVBAsTB1Jv
b3QgQ0ExGzAZBgNVBAMTEkdsb2JhbFNpZ24gUm9vdCBDQTAeFw05ODA5MDExMjAw
MDBaFw0yODAxMjgxMjAwMDBaMFcxCzAJBgNVBAYTAkJFMRkwFwYDVQQKExBHbG9i
YWxTaWduIG52LXNhMRAwDgYDVQQLEwdSb290IENBMRswGQYDVQQDExJHbG9iYWxT
aWduIFJvb3QgQ0EwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDaDuaZ
jc6j40+Kfvvxi4Mla+pIH/EqsLmVEQS98GPR4mdmzxzdzxtIK+6NiY6arymAZavp
xy0Sy6scTHAHoT0KMM0VjU/43dSMUBUc71DuxC73/OlS8pF94G3VNTCOXkNz8kHp
1Wrjsok6Vjk4bwY8iGlbKk3Fp1S4bInMm/k8yuX9ifUSPJJ4ltbcdG6TRGHRjcdG
snUOhugZitVtbNV4FpWi6cgKOOvyJBNPc1STE4U6G7weNLWLBYy5d4ux2x8gkasJ
U26Qzns3dLlwR5EiUWMWea6xrkEmCMgZK9FGqkjWZCrXgzT/LCrBbBlDSgeF59N8
9iFo7+ryUp9/k5DPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNVHRMBAf8E
BTADAQH/MB0GA1UdDgQWBBRge2YaRQ2XyolQL30EzTSo//z9SzANBgkqhkiG9w0B
AQUFAAOCAQEA1nPnfE920I2/7LqivjTFKDK1fPxsnCwrvQmeU79rXqoRSLblCKOz
yj1hTdNGCbM+w6DjY1Ub8rrvrTnhQ7k4o+YviiY776BQVvnGCv04zcQLcFGUl5gE
38NflNUVyRRBnMRddWQVDf9VMOyGj/8N7yy5Y0b2qvzfvGn9LhJIZJrglfCm7ymP
AbEVtQwdpf5pLGkkeB6zpxxxYu7KyJesF12KwvhHhm4qxFYxldBniYUr+WymXUad
DKqC5JlR3XC321Y9YeRq4VzW9v493kHMB65jUr9TU/Qr6cf9tveCX4XSQRjbgbME
HMUfpIBvFSDJ3gyICh3WZlXi/EjJKSZp4A==
-----END CERTIFICATE-----

5
config.ini

@@ -0,0 +1,5 @@
[database]
host = localhost
database = guoyantest
user = root
password = Guoyan83086775
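
Nothing in this commit actually reads config.ini (the crawler scripts read jdbc.properties instead), so the following reader is only a sketch of how the file could be consumed with the standard-library configparser; the section and key names come from the file above.

import configparser

config = configparser.ConfigParser()
config.read('config.ini')

db = config['database']                 # the [database] section
host = db.get('host', 'localhost')      # -> 'localhost'
database = db.get('database')           # -> 'guoyantest'
user = db.get('user')                   # -> 'root'
password = db.get('password')
print(host, database, user)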

714
crawler.py

@@ -0,0 +1,714 @@
#!/usr/bin/python3
"""
===========================================================================================
This module crawls procurement information.
Handling procurement announcements mainly involves four tables: sc_cggg, catalog,
catalogdata and readlog.
===========================================================================================
class Crawler:
    def __init__(self, connect):
    def generate_id(self):
    def write_log_information(self, data_id, catalog_name):
    def CrawlPage_gzw_ningbo(self, page):         # Ningbo SASAC municipal SOE bidding information
    def CrawlPage_zjcs_nbxzfw(self, type, page):  # Ningbo intermediary supermarket
    def CrawlPage_ygcg_nbcqjy_org(self, page):    # Ningbo sunshine procurement
    def CrawlPage_zfcg_czt_zj(self, page):        # Zhejiang government procurement network
    def CrawlPage_cbbidding(self, page):          # Ningbo Zhongji International Tendering Co., Ltd.
    def CrawlPage_zmeetb(self, page):             # Zhejiang International Tendering Co., Ltd.
    def CrawlPage_nbbidding(self, page):          # Ningbo International Tendering Co., Ltd.
============================================================================================
"""
import datetime
import hashlib
import pymysql
import json
import random
from requests_html import HTMLSession
from requests_html import HTML, UserAgent
import gymailer
import time

'''
============================================================
This class wraps the Splash rendering service.
Here:
    self.splash_ip is the IP address of the Splash service.
============================================================
'''
class Splash:
    def __init__(self):
        self.splash_ip = '127.0.0.1'

    '''
    ============================================================
    The wait_for parameter specifies the element to wait for: the call only returns
    once that element has been rendered, otherwise it waits up to 200 seconds.
    wait_for uses CSS selector syntax: an element id is written as "#app",
    a class as ".class-name".
    ============================================================
    '''
    def post(self, url, wait_for, pages=1, page_element='', headers={'content-type':'application/json','Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'}):
        lua_scripts = """
        function wait_for_element(splash, css, maxwait)
            -- Wait until a selector matches an element
            -- in the page. Return an error if waited more
            -- than maxwait seconds.
            if maxwait == nil then
                maxwait = 10
            end
            return splash:wait_for_resume(string.format([[
                function main(splash) {
                    var selector = '%s';
                    var maxwait = %s;
                    var end = Date.now() + maxwait*1000;
                    function check() {
                        if(document.querySelector(selector)) {
                            splash.resume('Element found');
                        } else if(Date.now() >= end) {
                            var err = 'Timeout waiting for element';
                            splash.error(err + " " + selector);
                        } else {
                            setTimeout(check, 200);
                        }
                    }
                    check();
                }
            ]], css, maxwait))
        end

        function main(splash, args)
            pages = """ + str(pages) + """
            page_element = '""" + page_element + """'
            wait_for = '""" + wait_for + """'
            splash:go('""" + url + """')
            wait_for_element(splash, wait_for)
            wait_for_element(splash, page_element)
            -- Add the first page to the result set
            results = {splash.html()}
            if pages == 1 then
                return results
            else
                -- Turn the pages:
                -- find the paging element on the page, then send a click() event to it
                for i = 2, pages do
                    -- js holds the javascript that fetches the paging element and sends the click event
                    js = string.format("document.querySelector('%s').click();", page_element)
                    -- Run the paging script
                    splash:runjs(js)
                    -- Wait for the page to finish loading
                    wait_for_element(splash, wait_for)
                    wait_for_element(splash, page_element)
                    -- A delay seems to be required here, otherwise the page update may not have finished
                    assert(splash:wait(5))
                    -- Add this page to the result set
                    table.insert(results, splash.html())
                end
                return results
            end
        end
        """
        splash_url = 'http://' + self.splash_ip + ':8050/execute'
        data = json.dumps({'lua_source':lua_scripts})
        r = HTMLSession().post(splash_url, headers=headers, data=data)
        return r

class Crawler:
    def __init__(self, connect):
        self.connect = connect

    def generate_id(self):
        # Generate a 32-character ID
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + str(random.randint(0, 1000000))
        md5_hash = hashlib.md5()
        md5_hash.update(current_time.encode('utf-8'))
        return md5_hash.hexdigest()

    def write_log_information(self, data_id, catalog_name, log_type='采购公告'):
        # A record has been added, so the related records must be kept in sync:
        # one catalogdata entry and one readlog entry.
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id from catalog where name = '%s'" % (log_type))
            if affected_row == 0:
                return False
            result = cursor.fetchall()
            catalog_id = result[0][0]
            catalogdata_id = self.generate_id()
            readlog_id = self.generate_id()
            affected_row = cursor.execute("SELECT staffid FROM userinfo where username = 'root'")
            if affected_row == 0:
                return False
            result = cursor.fetchall()
            staff_id = result[0][0]
            add_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            affected_row = cursor.execute(
                'insert into catalogdata (id, dataid, catalogid, creatorid, menderid, adddate, modifydate, datastatus) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                (catalogdata_id, data_id, catalog_id, staff_id, staff_id, add_date, add_date, 0))
            cursor.execute(
                'insert into readlog (id, dataid, staffid, readnum, adddate, LastAccessDate, resid) values (%s, %s, %s, %s, %s, %s, %s)',
                (readlog_id, data_id, staff_id, 1, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), catalog_id))
            return True

    def write_information(self, title, url, region, publishTime, announcementType):
        # Write one announcement into the database
        with self.connect.cursor() as cursor:
            cggg_id = self.generate_id()
            try:
                title = title.replace("'", "\\\'")
                affected_rows = cursor.execute(
                    'insert into sc_cggg (id, bt, lj, ssqy, fbsj, gglb) values (%s, %s, %s, %s, %s, %s)',
                    (cggg_id, title, url, region, publishTime, announcementType))
            except pymysql.err.IntegrityError:
                print('信息重复')
                self.connect.rollback()
                return False
            else:
                if self.write_log_information(cggg_id, announcementType):
                    self.connect.commit()
                else:
                    print('添加采购信息失败')
                    self.connect.rollback()
                    return False
            return True

    def write_information_cgyx(self, cgyx):
        # Write one procurement-intention record into the database
        with self.connect.cursor() as cursor:
            cgyx_id = self.generate_id()
            cgyx['cgxmmc'] = cgyx['cgxmmc'].replace("'", "\\\'")
            strSql = 'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) values (\''+cgyx_id+'\',\''+cgyx['cgxmmc']+'\',\''+cgyx['lj']+'\',\''+cgyx['cgxqqk']+'\',\''+cgyx['ysje']+'\',\''+cgyx['yjcgsj']+'\',\''+cgyx['ly']+'\')'
            try:
                affected_rows = cursor.execute(strSql)
            except pymysql.err.IntegrityError:
                print('信息重复')
                #self.connect.rollback()
                return False
            else:
                if self.write_log_information(cgyx_id, '采购意向'):
                    self.connect.commit()
                else:
                    print('添加采购信息失败')
                    self.connect.rollback()
                    return False
            return True

    def Check(self):
        # Verify that announcements were written during the last day; alert by mail otherwise
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id as total from sc_cggg where date(fbsj) > (NOW() - INTERVAL 1 DAY);")
            if affected_row == 0:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息', '采购信息提取不正常,请检查!')
                return False
            else:
                return True

    def Crawl(self):
        # Overall entry point that actually performs the crawling.
        # Crawl the Zhejiang government procurement network
        print('开始获取浙江政采网的信息\n')
        # Announcement-type parameters to pass in
        infoType = [
            {"announcementCode": "110-175885", "announcementType":"采购意向"},
            {"announcementCode": "110-978863", "announcementType":"采购公告"},
            {"announcementCode": "110-943756", "announcementType":"更正公告"},
            {"announcementCode": "110-420383", "announcementType":"非政府采购公告"},
            {"announcementCode": "110-900461", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 11):
                try:
                    self.CrawlPage_zfcg_czt_zj(page, typeParam)
                except Exception as e:
                    print('3--------------------------------', e)
        # Crawl the Ningbo sunshine procurement network
        print('开始获取宁波市阳光采购网的信息\n')
        infoType = [
            {"announcementCode": "21", "announcementType":"采购公告"},
            {"announcementCode": "23", "announcementType":"更正公告"},
            {"announcementCode": "22", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            try:
                self.CrawlPage_ygcg_nbcqjy_org(2, typeParam)
            except Exception as e:
                print('4--------------------------------', e)
        # Crawl the Ningbo intermediary supermarket network
        print('开始获取宁波市中介超市网的信息\n')
        infoType = [
            {"announcementCode": '1', "announcementType":"项目需求公告"},
            {"announcementCode": '2', "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_zjcs_nbxzfw(page, typeParam)
                except Exception as e:
                    print('5------------------------------', e)
        # Crawl procurement information of Ningbo SASAC municipal SOEs
        print('开始获取宁波市国资委市属企业招投标网的信息\n')
        for page in range(1, 5):
            try:
                self.CrawlPage_gzw_ningbo(page)
            except Exception as e:
                print('6------------------------------', e)
        # Crawl the Ningbo Zhongji international tendering network
        print('开始获取宁波中基国际招标网的信息\n')
        infoType = [
            {"announcementCode": "22", "announcementType":"采购公告"},
            {"announcementCode": "23", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_cbbidding(page, typeParam)
                except Exception as e:
                    print('7--------------------------------', e)
        # Crawl the Zhejiang international tendering network
        print('开始获取浙江国际招标网的信息\n')
        infoType = [
            {"announcementCode": "Zbgg", "announcementType":"采购公告"},
            {"announcementCode": "Gzgg", "announcementType":"更正公告"},
            {"announcementCode": "jggg", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_zmeetb(page, typeParam)
                except Exception as e:
                    print('8----------------------------', e)
        # Crawl the Ningbo International Tendering Co., Ltd. website
        print('开始获取宁波国际招标网的信息\n')
        # Announcement-type parameters to pass in
        infoType = [
            {"announcementCode": "1", "announcementType":"采购公告"},
            {"announcementCode": "1", "announcementType":"结果公告"},
            {"announcementCode": "2", "announcementType":"采购公告"},
            {"announcementCode": "2", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_nbbidding(page, typeParam)
                except Exception as e:
                    print('9--------------------------------', e)
        # Crawl the Ningbo Mingcheng tendering agency website
        print('开始获取宁波名城招标的信息\n')
        # Announcement-type parameters to pass in
        infoType = [
            {"announcementCode": "99", "announcementType":"采购公告"},
            {"announcementCode": "88", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 2):
                try:
                    self.CrawlPage_nbmcbidding(page, typeParam)
                except Exception as e:
                    print('10--------------------------------', e)

    # Ningbo Zhongji International Tendering Co., Ltd. https://www.cbbidding.com/
    def CrawlPage_cbbidding(self, page, typeParam):
        # Crawl one page of announcements.
        session = HTMLSession()
        session.DEFAULT_RETRIES = 5
        url = 'https://www.cbbidding.com/Index/cms.html?mid=' +typeParam['announcementCode'] + '&%2FIndex%2Fcms%2Fmid%2F' + typeParam['announcementCode'] + '_html=&page=' + str(page)
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        # This site returns an HTML page, so it has to be parsed
        r = session.get(url = url, headers = headers)
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中基国际招标网', r.text)
            return False
        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[3]/div[3]/div[2]/div[2]/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = 'https://www.cbbidding.com' + item.xpath('//a')[0].attrs.get('href')
            region = '中基招标'
            publishDate = item.xpath('//div')[0].text
            try:
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
            except Exception as e:
                publishDate = publishDate.replace('.', '-')
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
            print(url, title)
            announcementType = typeParam['announcementType']
            #print(title, url, region, publishDate, announcementType)
            self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang International Tendering Co., Ltd. https://www.zmeetb.com/
    def CrawlPage_zmeetb(self, page, typeParam):
        # Crawl one page of announcements.
        session = HTMLSession()
        url = 'https://www.zmeetb.com/' +typeParam['announcementCode'] + '/index/p/' + str(page) + '.html'
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Cache-Control": "max-age=0",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
            "DNT": '1',
            "Host": "www.zmeetb.com",
            "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "Windows",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        # This site returns an HTML page, so it has to be parsed.
        # Calling render() on this site runs into an SSL certificate problem;
        # the chromium certificate setup needs further study.
        #r = session.get(url = url, headers = headers, verify='/opt/PyGuoyan/www.zmeetb.com')
        r = session.get(url = url, headers = headers, verify=False)
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:浙江国际招标网', r.text)
            return False
        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[1]/div[3]/div[2]/div/div/div[3]/div/ul/li/a')
        for item in data:
            title = item.xpath('//p')[0].text
            url = item.attrs.get('href')
            region = '浙江国际招标'
            publishDate = item.xpath('//p')[1].text
            announcementType = typeParam['announcementType']
            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo Mingcheng Tendering Co., Ltd. http://www.nbmcbidding.com/
    def CrawlPage_nbmcbidding(self, page, typeParam):
        # Crawl one page of announcements.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbmcbidding.com/news/99/"+str(page)+"/"
        else:
            url = "http://www.nbmcbidding.com/news/88/"+str(page)+"/"
        data = {}
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Host": "www.nbmcbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        r = session.get(url = url, headers = headers, json = data)
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波名诚招标代理有限公司', r.text)
            return False
        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[1]/div/div[3]/div[2]/ul/li')
        for item in data:
            title = item.xpath('//a/div[2]')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波名诚招标'
            publishDate = item.xpath('//a/div[4]')[0].text
            announcementType = typeParam['announcementType']
            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo International Tendering Co., Ltd. http://www.nbbidding.com/
    def CrawlPage_nbbidding(self, page, typeParam):
        # Crawl one page of announcements.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbbidding.com/Home/Notice/news_list?page="+str(page)+"&is_Open=1&keyword"
        else:
            url = "http://www.nbbidding.com/Home/Publicity/news_list?page="+str(page)+"&is_Open=1&keyword"
        data = {}
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Host": "www.nbbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        r = session.get(url = url, headers = headers, json = data)
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国际招标网', r.text)
            return False
        data = json.loads(r.text)['data']
        total = data['page']['count']
        data = data['list']
        for item in data:
            id = item['id']
            if typeParam['announcementType'] == '采购公告':
                url = 'http://www.nbbidding.com/Home/Notice/news_detail?id=%s' % (id)
            else:
                url = 'http://www.nbbidding.com/Home/Publicity/news_detail?id=%s' % (id)
            title = item['title']
            region = '宁波国际招标'
            publishDate = item['addtime']
            announcementType = item['stage']
            self.write_information(title, url, region, publishDate, announcementType)
            print(publishDate, title, url)

    # Ningbo SASAC municipal SOE bidding information network
    def CrawlPage_gzw_ningbo(self, page):
        # Crawl one page of announcements.
        session = HTMLSession()
        url = 'http://gzw.ningbo.gov.cn/col/col1229663137/index.html?uid=6085425&pageNum=%s' % str(page)
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        # This site returns an HTML page, so it has to be rendered and parsed
        r = session.get(url = url, headers = headers)
        r.html.render()
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国资委市属企业招标信息网', r.text)
            return False
        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[2]/div[3]/div/div/div[2]/div/div/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波市属国企'
            publishDate = item.xpath('//p')[0].text
            announcementType = '采购公告'
            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo intermediary supermarket network
    def CrawlPage_zjcs_nbxzfw(self, page, typeParam):
        # Crawl one page of announcements.
        # typeParam identifies the announcement type.
        session = HTMLSession()
        urllist = ['http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0901&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15','http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0902&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15']
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "Host": "ygcg.nbcqjy.org",
            "Referer": "http://zjcs.nbxzfw.gov.cn/newsweb/page/news/infolist.html?Type="+str(type),
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }
        for url in urllist:
            r = session.get(url = url, headers = headers)
            if r.status_code != 200:
                if page == 1:
                    gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text)
                return False
            data = json.loads(r.text)['data']
            total = data['total']
            data = data['rows']
            for item in data:
                articleId = item['AutoId']
                BulletinTypeId = item['BulletinTypeId']
                url = 'http://zjcs.nbxzfw.gov.cn/YWGG/Info?id=%s&Type=%s' % (articleId, BulletinTypeId)
                title = item['BulletinTitle']
                region = '宁波中介超市'
                publishDate = item['PublishDate']
                announcementType = typeParam['announcementType']
                self.write_information(title, url, region, publishDate, announcementType)
                #print(publishDate, url)

    # Ningbo sunshine procurement network
    def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
        url = 'https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType=' + typeParam['announcementCode']
        wait_for = '.ant-pagination-item-ellipsis'
        page_element = '.anticon-right'
        try:
            r = Splash().post(url, wait_for, pages=pages, page_element=page_element)
        except Exception as e:
            print(e)
        results = json.loads(r.text)
        # Crawl the requested pages.
        if r.status_code != 200:
            if pages == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, 错误代码:'+str(r.status_code), r.text)
            return False
        for i in range(1, pages + 1):
            data = HTML(html=results[str(i)]).xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[5]/div[1]/div/ul/li')
            if len(data) == 0:
                print('数据为空')
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, keyerror', '数据为空')
                return False
            for item in data:
                url = 'http://ygcg.nbcqjy.org' + item.xpath('//a')[0].attrs.get('href')
                title = item.xpath('//a/span[3]')[0].text
                region = '宁波阳光采购'
                publishDate = item.xpath('//div[2]')[0].text
                announcementType = typeParam['announcementType']
                print(title)
                self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang government procurement network
    def CrawlPage_zfcg_czt_zj(self, page, typeParam):
        # Crawl one page of announcements.
        session = HTMLSession()
        url = 'https://zfcg.czt.zj.gov.cn/portal/category'
        if typeParam['announcementCode'] == '110-420383':
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "districtCode": ["339900"],
                "isProvince": True,
                "includeGovDistrict": "1",
                "_t": 1699104836000
            }
        else:
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "isGov": True,
                "excludeDistrictPrefix": "90",
                "_t": 1699104836000
            }
        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "content-type": "application/json;charset=UTF-8",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "x-requested-with": "XMLHttpRequest"
        }
        try:
            r = session.post(url = url, headers = headers, json = data)
        except Exception as e:
            print('10-------------------------', e)
        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波政府采购网', r.text)
            return False
        data = json.loads(r.text)['result']['data']
        total = data['total']
        data = data['data']
        for item in data:
            publishDate = datetime.datetime.fromtimestamp(item['publishDate']/1000)
            pageUrl = 'https://zfcg.czt.zj.gov.cn/luban/detail?parentId=600007&articleId=' + item['articleId'] + '&utm=luban.luban-PC-37000.979-pc-websitegroup-zhejiang-secondPage-front.21.320086307d6811ee86314be74945ec2c'
            detailUrl = 'https://zfcg.czt.zj.gov.cn/portal/detail?articleId=' + item['articleId']
            announcementType = typeParam['announcementType']
            if announcementType == '采购意向':
                r = session.get(url = detailUrl, headers = headers)
                detailData = json.loads(r.text)['result']['data']
                if detailData == None:
                    break
                content = HTML(html='<xml>'+detailData['content']+'</xml>')
                region = item['districtName']
                for detailItem in content.xpath('xml/div/div/div[1]/div/table/tbody/tr'):
                    title = detailItem.xpath('//td[2]')[0].text
                    cgxqqk = detailItem.xpath('//td[3]')[0].text
                    ysje = detailItem.xpath('//td[4]')[0].text
                    yjcgsj = detailItem.xpath('//td[5]')[0].text
                    ly = detailData["title"]
                    self.write_information(title, pageUrl, region, publishDate, announcementType)
                    self.write_information_cgyx({'cgxmmc':title,'lj':pageUrl, 'cgxqqk':cgxqqk, 'ysje':ysje, 'yjcgsj':yjcgsj, 'ly':ly})
            else:
                title = item['title']
                region = item['districtName']
                self.write_information(title, pageUrl, region, publishDate, announcementType)
            #print(publishDate, url)
        return True
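
For reference, a minimal sketch of driving the Crawler class by hand. The host, user and password values below are placeholders; main.py and gycrawler.py further down read the real credentials from jdbc.properties.

import pymysql
from crawler import Crawler

# Placeholder connection values; see jdbc.properties / properties.py for the real ones.
connect = pymysql.connect(host='localhost', user='root',
                          password='***', database='guoyantest')
try:
    crawler = Crawler(connect)
    # Crawl a single source and announcement type:
    crawler.CrawlPage_zfcg_czt_zj(1, {"announcementCode": "110-978863",
                                      "announcementType": "采购公告"})
    # Or run the full pass over all configured sites:
    # crawler.Crawl()
finally:
    connect.close()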

137
dbsearch.py

@@ -0,0 +1,137 @@
#!/usr/bin/python3
import pymysql
from properties import Properties
import sys, getopt

class DbSearch:
    # Provides full-text search utilities over the database
    def __init__(self, connect):
        self.connect = connect

    def GetTableList(self, database):
        # List the tables of a database
        cursorTable = self.connect.cursor()
        cursorTable.execute("SELECT table_name FROM INFORMATION_SCHEMA.TABLES where table_schema = '" + database + "'");
        return cursorTable.fetchall()

    def GetColumnList(self, tableName):
        # List the columns of a table (relies on the module-level 'database' name)
        cursorColumn = self.connect.cursor()
        cursorColumn.execute("SELECT column_name,data_type FROM INFORMATION_SCHEMA.COLUMNS where table_schema='" + database + "' AND table_name='" +
                             tableName + "'");
        return cursorColumn.fetchall()

    def SearchTableByColumnName(self, columnName, database):
        # Find tables containing a column whose name includes columnName
        tableList = self.GetTableList(database)
        findList = list()
        for table in tableList:
            columnList = self.GetColumnList(table[0])
            for column in columnList:
                if column[0].find(columnName) != -1:
                    findList.append(table[0])
        return findList

    def SearchTableByText(self, searchText, database):
        # Find tables whose rows contain the string searchText, and print the matching rows
        tableList = self.GetTableList(database)
        if len(tableList) == 0:
            return False
        found = 0
        findList = list()
        for table in tableList:
            strSql = "SELECT '" + table[0] + "' as table_name, t.* "
            strSql = strSql + " FROM " + database + "." + table[0] + " as t where " + "("
            columnList = self.GetColumnList(table[0])
            i = 0
            count = len(columnList)
            for column in columnList:
                # Skip columns whose data type is not textual
                if not column[1] in ('varchar', 'char', 'text'):
                    continue
                i += 1
                if i > 1:
                    strSql += " or "
                strSql += column[0] + " like '%" + searchText + "%' "
            strSql += ")"
            cursorColumn = self.connect.cursor()
            try:
                cursorColumn.execute(strSql)
            except Exception as e:
                print('2----------------------------', database, strSql)
                print("-----错误信息:-----\n", e)
                return False
            result = cursorColumn.fetchall()
            if len(result) > 0:
                findList.append(table[0])
                print("==========================================================================")
                print(table[0], result, strSql)
        return findList

if __name__ == '__main__':
    print(
        """
        ============================================================
        |这是数据库全文检索工具,包含两个参数 |
        ============================================================
        """)
    # Set the runtime environment. If this is the test environment, set is_test to True.
    is_test = False
    if is_test:
        file_path = "/opt/eresource_test/webapp/WEB-INF/classes/prod/jdbc.properties"
        database = 'guoyantest'
    else:
        file_path = "/opt/eresource/webapp/WEB-INF/classes/prod/jdbc.properties"
        database = 'guoyan'
    # Read the database settings from jdbc.properties
    props = Properties(file_path)
    host = 'localhost'
    user = props.get('jdbc.username')
    password = props.get('jdbc.password')
    # Open the database connection
    db = pymysql.connect(host = host, user = user, password = password, database = database)
    # Parse the command-line arguments
    keyword = ''
    searchType = ''
    try:
        opts, args = getopt.getopt(sys.argv[1:],"hT:k:",["keyword=","searchType="])
    except getopt.GetoptError:
        print(sys.argv[0] + ' -k <keyword> -T <searchType>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('3--------------------', 'test.py -k <keyword> -T <searchType>')
            sys.exit()
        elif opt in ("-k", "--keyword"):
            keyword = arg
        elif opt in ("-T", "--searchType"):
            searchType = arg
    dbSearch = DbSearch(db)
    if searchType == '0':
        print('正在根据您输入的关键词查找表.....................')
        print('found tables: ', dbSearch.SearchTableByText(keyword, database))
    elif searchType == '1':
        print('正在根据您输入的列名查找表.....................')
        print('found tables: ', dbSearch.SearchTableByColumnName(keyword, database))
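
The script is normally driven from the command line with -k and -T, but DbSearch can also be used as a library. A small sketch (connection values are placeholders); note that GetColumnList reads the module-level database name, so it has to be set when dbsearch is imported rather than run as a script:

import pymysql
import dbsearch

db = pymysql.connect(host='localhost', user='root',
                     password='***', database='guoyantest')
dbsearch.database = 'guoyantest'   # GetColumnList reads this module-level name
searcher = dbsearch.DbSearch(db)
# Tables that have a column whose name contains 'title':
print(searcher.SearchTableByColumnName('title', 'guoyantest'))
# Tables whose text columns contain the string '采购':
print(searcher.SearchTableByText('采购', 'guoyantest'))
db.close()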

59
gycrawler.py

@@ -0,0 +1,59 @@
#!/usr/bin/python3
"""This is the crawler's main program (scheduled version).
Author: Chen Jinqian
Date: 2023/11/03
"""
import pymysql
import datetime
import time
from apscheduler.schedulers.blocking import BlockingScheduler
from properties import Properties
from crawler import Crawler

print(
    """采购信息采集器 v1.0
===================================================================================
这个程序用于获取各大招投标网站的采购信息
version: 1.0
作者:陈进钱
日期:2023-11-04
===================================================================================""")
# Set the runtime environment. If this is the test environment, set is_test to True.
is_test = False
if is_test:
    file_path = "/opt/eresource_test/webapp/WEB-INF/classes/prod/jdbc.properties"
    database = 'guoyantest'
else:
    file_path = "/opt/eresource/webapp/WEB-INF/classes/prod/jdbc.properties"
    database = 'guoyan'
# Read the database settings from jdbc.properties
props = Properties(file_path)
host = 'localhost'
user = props.get('jdbc.username')
password = props.get('jdbc.password')
# Open the database connection
connect = pymysql.connect(host = host, user = user, password = password, database = database)
# Fetch procurement information and write it into the database
crawler = Crawler(connect)

# Daily check job
def crawl_check_func():
    crawler.Check()

# Crawl job
def crawl_job_func():
    crawler.Crawl()

sched = BlockingScheduler()
sched.add_job(crawl_job_func, 'interval', hours=3, jitter=120, max_instances=4)
sched.add_job(crawl_check_func, 'interval', days=1, jitter=120, max_instances=4)
sched.start()
# Close the database connection (only reached after the scheduler shuts down)
connect.close()

44
gymailer.py

@@ -0,0 +1,44 @@
import smtplib
from email.mime.text import MIMEText
from email.header import Header

def SendMail(sender, receiver, subject, message):
    # SMTP server used for sending
    smtp_server = 'smtp.126.com'
    # SMTP server port (SSL)
    smtp_port = 465
    # Build the mail object
    msg = MIMEText(message, 'plain', 'utf-8')
    msg['From'] = Header(sender, 'utf-8')
    msg['To'] = Header(receiver, 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    # Create the SMTP object
    smtpObj = smtplib.SMTP_SSL(smtp_server, smtp_port)
    # Log in to the SMTP server
    smtpObj.login(sender, 'ERXYFJRLKPTTDXWH')
    # Send the mail
    smtpObj.sendmail(from_addr=sender, to_addrs=[receiver], msg=msg.as_string())
    # Close the SMTP connection
    smtpObj.quit()

if __name__ == '__main__':
    # Sender address
    sender = 'jinqian_chen@126.com'
    # Receiver address
    receiver = 'jinqian.chen@srit.com.cn'
    # Mail subject
    subject = 'Python3发送邮件示例, new'
    # Mail body
    message = '这是一封Python3发送的邮件'
    SendMail(sender, receiver, subject, message)

46
jdbc.properties

@@ -0,0 +1,46 @@
#Db2
#hibernate.dialect=org.hibernate.dialect.DB2Dialect
#jdbc.driverClassName=com.ibm.db2.jcc.DB2Driver
#jdbc.url=jdbc:db2://localhost:50000/eaching
#Oracle
#hibernate.dialect=org.hibernate.dialect.Oracle10gDialect
#jdbc.driverClassName=oracle.jdbc.driver.OracleDriver
#jdbc.url=jdbc:oracle:thin:@47.99.208.214:1521:orcl
#jdbc.url=jdbc:oracle:thin:@118.190.161.36:1521:orcl
#SqlServer
#hibernate.dialect=org.hibernate.dialect.SQLServerDialect
#jdbc.driverClassName=net.sourceforge.jtds.jdbc.Driver
#jdbc.url=jdbc:jtds:sqlserver://localhost:1433/guanwaimatou;SelectMethod=Cursor
#MySql
hibernate.dialect=org.hibernate.dialect.MySQLDialect
jdbc.driverClassName=com.mysql.jdbc.Driver
jdbc.url=jdbc:mysql://116.62.210.190:3306/guoyantest?autoReconnect=true&useUnicode=true&characterEncoding=UTF8&mysqlEncoding=utf8&zeroDateTimeBehavior=convertToNull
jdbc.username=root
jdbc.password=Guoyan83086775
jdbc.maxConn=20
jdbc.minConn=5
jdbc.activeTime=900000
jdbc.alias=eaching
jdbc.keepingSleepTime=30000
jdbc.maxConnectionLifetime=60000
jdbc.multiSource=false
hibernate.cache.use_second_level_cache=true
hibernate.show_sql=false
hibernate.generate_statistics=false
hibernate.cache.provider_class=org.hibernate.cache.EhCacheProvider
#hibernate.cache.provider_class=net.oschina.j2cache.hibernate3.J2CacheProvider
hibernate.cache.use_minimal_puts=true
hibernate.cache.use_structured_entries=true
hibernate.cache.use_query_cache=true
hibernate.use_sql_comments=trues
hibernate.order_updates=true
hibernate.format_sql=false
hbm2ddl.auto=create
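
The crawler scripts (gycrawler.py, main.py, dbsearch.py) read only the jdbc.username and jdbc.password keys from this file, through the Properties class defined in properties.py below. A short sketch of that read path:

from properties import Properties

props = Properties('/opt/eresource/webapp/WEB-INF/classes/prod/jdbc.properties')
user = props.get('jdbc.username')            # -> 'root'
password = props.get('jdbc.password')
max_conn = props.get('jdbc.maxConn', '10')   # default value applies if the key is missing
print(user, max_conn)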

75
main.py

@@ -0,0 +1,75 @@
#!/usr/bin/python3
"""This is the crawler's main program.
Author: Chen Jinqian
Date: 2023/11/03
"""
import pymysql
import datetime
import time
from apscheduler.schedulers.blocking import BlockingScheduler
from properties import Properties
from crawler import Crawler
import sys
import os

print(
    """采购信息采集器 v1.0
===================================================================================
这个程序用于获取各大招投标网站的采购信息
version: 1.0
作者:陈进钱
日期:2023-11-04
===================================================================================""")
# Set the runtime environment. If this is the test environment, set is_test to True.
is_test = True
if is_test:
    root = "/opt/eresource_test/webapp/WEB-INF/classes/prod/"
else:
    root = "/opt/eresource/webapp/WEB-INF/classes/prod/"
if os.path.exists(root):
    file_path = root + "jdbc.properties"
else:
    file_path = "jdbc.properties"
if sys.platform == 'win32':
    host = '116.62.210.190'
    user = 'root'
    password = 'Guoyan83086775'
    if is_test:
        database = 'guoyantest'
    else:
        database = 'guoyan'
else:
    if is_test:
        database = 'guoyantest'
    else:
        database = 'guoyan'
# Read the database settings from jdbc.properties
props = Properties(file_path)
host = '116.62.210.190'
user = props.get('jdbc.username')
password = props.get('jdbc.password')
# Open the database connection
connect = pymysql.connect(host = host, user = user, password = password, database = database)
# Fetch procurement information and write it into the database
crawler = Crawler(connect)
crawler.Crawl()
#crawler.CrawlPage_ygcg_nbcqjy_org(1, {"announcementCode": "21", "announcementType":"采购公告"})
#print(crawler.Check())
# Start the scheduled crawl job
#def crawl_job_func():
#    crawler.Crawl()
#sched = BlockingScheduler()
#sched.add_job(crawl_job_func, 'interval', hours=1, jitter=120)
#sched.start()
# Close the database connection
connect.close()

BIN
myrec.db

Binary file not shown.

72
properties.py

@@ -0,0 +1,72 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re
import os
import tempfile

class Properties:
    def __init__(self, file_name):
        # If the given file does not exist, fall back to the local file
        if not os.path.exists(file_name):
            file_name = 'jdbc.properties'
        self.file_name = file_name
        self.properties = {}
        try:
            fopen = open(self.file_name, 'r')
            for line in fopen:
                line = line.strip()
                if line.find('=') > 0 and not line.startswith('#'):
                    strs = line.split('=')
                    self.properties[strs[0].strip()] = strs[1].strip()
        except Exception as e:
            raise e
        else:
            fopen.close()

    def has_key(self, key):
        return key in self.properties

    def get(self, key, default_value=''):
        if key in self.properties:
            return self.properties[key]
        return default_value

    def put(self, key, value):
        self.properties[key] = value
        replace_property(self.file_name, key + '=.*', key + '=' + value, True)

def replace_property(file_name, from_regex, to_str, append_on_not_exists=True):
    tmpfile = tempfile.TemporaryFile()
    if os.path.exists(file_name):
        r_open = open(file_name, 'r')
        pattern = re.compile(r'' + from_regex)
        found = None
        for line in r_open:
            if pattern.search(line) and not line.strip().startswith('#'):
                found = True
                line = re.sub(from_regex, to_str, line)
            tmpfile.write(line.encode())
        if not found and append_on_not_exists:
            tmpfile.write(('\n' + to_str).encode())
        r_open.close()
        tmpfile.seek(0)
        content = tmpfile.read()
        if os.path.exists(file_name):
            os.remove(file_name)
        w_open = open(file_name, 'wb')
        w_open.write(content)
        w_open.close()
        tmpfile.close()
    else:
        print("file %s not found" % file_name)

14
splash/SYgcg.py

@@ -0,0 +1,14 @@
#!/usr/bin/python3
from splash.gysplash import SBase
import json

class SYgcg(SBase):
    def open(self):
        return super().open('ygcg', pages=2, annoucement_type='政府采购')

if __name__ == '__main__':
    test = SYgcg()
    r = test.open()
    results = json.loads(r.text)
    print(results)

0
splash/__init__.py

BIN
splash/__pycache__/SYgcg.cpython-310.pyc

Binary file not shown.

BIN
splash/__pycache__/__init__.cpython-310.pyc

Binary file not shown.

BIN
splash/__pycache__/gysplash.cpython-310.pyc

Binary file not shown.

7
splash/config/splash.cnf

@@ -0,0 +1,7 @@
#This is splash config file.
[splash service settings]
server = localhost
port = 8050
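
gysplash.py imports configparser, but in this commit only splash.json is actually read, so the following consumer of splash.cnf is an assumption about the intended use of this file:

import configparser

cfg = configparser.ConfigParser()
cfg.read('splash/config/splash.cnf')
section = cfg['splash service settings']
splash_url = 'http://%s:%s/execute' % (section.get('server', 'localhost'),
                                       section.get('port', '8050'))
print(splash_url)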

20
splash/config/splash.json

@@ -0,0 +1,20 @@
{
    "description": "This is splash config file.",
    "server": "127.0.0.1",
    "port": "8050",
    "class": {
        "SYgcg": {
            "url": "https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType={{ $noticeType }}",
            "_comment": "http://www.baidu.com",
            "param": {
                "noticeType": "21"
            },
            "wait_for": ".ant-list-items",
            "page_element": ".anticon-right",
            "headers": {
                "content-type": "application/json",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0"
            }
        }
    }
}
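
The {{ $noticeType }} placeholder in the "url" entry is filled in from the values under "param" by SBase.replace() in splash/gysplash.py; the same mechanism later injects pages, wait_for and the other values into the Lua script. A sketch of just that substitution step, with the regex mirroring SBase.replace():

import json
import re

def fill(source, param, value):
    # Same pattern shape as SBase.replace() in splash/gysplash.py
    return re.sub(r'{{[\s]*\$' + param + r'[\s]*}}', value, source)

with open('splash/config/splash.json', encoding='utf-8') as f:
    cfg = json.load(f)

entry = cfg['class']['SYgcg']
url = entry['url']
for name, value in entry['param'].items():
    url = fill(url, name, value)
print(url)   # ...&noticeType=21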

1
splash/configscripts/main.lua

@@ -0,0 +1 @@
This is lua script file

122
splash/gysplash.py

@@ -0,0 +1,122 @@
#!/usr/bin/python3
'''===================================================================
This module wraps the Splash service so it is easy to use from Python.
Version: 1.0
Author: Chen Jinqian
Date: 2023-12-18
==================================================================='''
import os
import re
import json
import codecs
import configparser
from requests_html import HTMLSession
from requests_html import HTML

config = configparser.ConfigParser()

# Splash base class
class SBase:
    def __init__(self):
        self.__lua_script = ''
        self.config = {}
        # Locate the module directory
        self.root = os.path.dirname(os.path.abspath(__file__))
        # Create the config file automatically if it is missing
        dir = self.root + '/config'
        if not os.path.exists(dir):
            os.makedirs(dir)
        file_path = self.root + '/config/splash.json'
        if os.path.exists(file_path):
            file = codecs.open(file_path, 'r', 'utf-8')
            content = file.read()
            self.config = json.loads(content)
            file.close()
        else:
            self.config['description'] = 'This is splash config file.'
            self.config['server'] = 'localhost'
            self.config['port'] = '8050'
            content = json.dumps(self.config)
            with codecs.open(file_path, 'w', 'utf-8') as file:
                file.write(content)
        # Create an empty script file automatically if it is missing
        dir = self.root + '/scripts'
        if not os.path.exists(dir):
            os.makedirs(dir)
        # This code should be replaced with something more generic
        file_path = dir + '/main.lua'
        if os.path.exists(file_path):
            file = codecs.open(file_path, 'r', 'utf-8')
            self.__lua_script = file.read()
            file.close()
        else:
            with codecs.open(file_path, 'w', 'utf-8') as file:
                self.__lua_script = 'This is lua script file'
                file.write(self.__lua_script)

    def script(self):
        return self.__lua_script

    def class_name(self):
        return type(self).__name__

    def replace(self, source, param, value):
        return re.sub('{{[\s]*\$' + param + '[\s]*}}', value, source)

    # Pass parameter values into the lua script
    def set_params_for_lua(self, scripts, params):
        for param in params:
            scripts = self.replace(scripts, param, params[param])
        return scripts

    '''
    --------------------------------------------------------------------------------------
    This function opens the given URL. The URL and its parameters, the element to wait for,
    the paging element, the headers and so on all default to empty; any value left empty is
    taken from the entry for the calling class in the config file.
    It runs the main lua script, which in turn runs the page-parsing javascript whose file
    name is passed in through the scripts_js parameter.
    --------------------------------------------------------------------------------------
    '''
    def open(self, scripts_js, pages=1, url='', params=None,
             wait_for='', page_element='', headers='', annoucement_type=''):
        if url == '':
            url = self.config['class'][self.class_name()]['url']
        if params == None:
            params = self.config['class'][self.class_name()]['param']
        if len(params) > 0:
            for param in params:
                url = self.replace(url, param, params[param])
        if wait_for == '':
            wait_for = self.config['class'][self.class_name()]['wait_for']
        if page_element == '':
            page_element = self.config['class'][self.class_name()]['page_element']
        if headers == '':
            headers = self.config['class'][self.class_name()]['headers']
        scripts = self.script()
        scripts = self.set_params_for_lua(scripts, {
            'pages': str(pages),
            'url': url,
            'wait_for': wait_for,
            'page_element': page_element,
            # The parser script name is passed through as a parameter
            'scripts_js': scripts_js,
            'announcement_type': annoucement_type
        })
        # print(scripts)
        data = json.dumps({'lua_source': scripts})
        splash_url = 'http://' + self.config['server'] + ':' + self.config['port'] + '/execute'
        r = HTMLSession().post(splash_url, headers=headers, data=data)
        return r

75
splash/scripts/main.lua

@@ -0,0 +1,75 @@
-- This file is the main entry point for page scraping.
-- The parser has to be loaded as a module here; dynamically loading the js file
-- directly does not seem to work otherwise.
parser = require('parser')

function main(splash, args)
    pages = {{$pages}}
    scripts_js = '{{$scripts_js}}'
    page_element = '{{$page_element}}'
    wait_for = '{{$wait_for}}'
    announcement_type = '{{$announcement_type}}'
    splash:go('{{$url}}')
    wait_for_element(splash, wait_for)
    wait_for_element(splash, page_element)
    -- Set the javascript script parameters
    results = {}
    params_js = {}
    params_js['announcement_type'] = announcement_type
    -- Add the first page to the result set
    result = parser.select(splash, scripts_js, params_js)
    table.insert(results, result)
    if pages == 1 then
        return results
    else
        -- Turn the pages:
        -- find the paging element on the page, then send a click() event to it
        for i = 2, pages do
            -- js holds the javascript that fetches the paging element and sends the click event
            js = string.format("document.querySelector('%s').click();", page_element)
            -- Run the paging script
            splash:runjs(js)
            -- Wait for the page to finish loading
            wait_for_element(splash, wait_for)
            wait_for_element(splash, page_element)
            -- A delay seems to be required here, otherwise the page update may not have finished
            assert(splash:wait(5))
            result = parser.select(splash, scripts_js, params_js)
            table.insert(results, result)
        end
        return results
    end
end

function wait_for_element(splash, css, maxwait)
    -- Wait until a selector matches an element
    -- in the page. Return an error if waited more
    -- than maxwait seconds.
    if maxwait == nil then
        maxwait = 10
    end
    return splash:wait_for_resume(string.format([[
        function main(splash) {
            var selector = '%s';
            var maxwait = %s;
            var end = Date.now() + maxwait*1000;
            function check() {
                if(document.querySelector(selector)) {
                    splash.resume('Element found');
                } else if(Date.now() >= end) {
                    var err = 'Timeout waiting for element';
                    splash.error(err + " " + selector);
                } else {
                    setTimeout(check, 200);
                }
            }
            check();
        }
    ]], css, maxwait))
end

2
splash/scripts/modules/jquery-3.7.1.min.js

File diff suppressed because one or more lines are too long

28
splash/scripts/modules/parser.lua

@@ -0,0 +1,28 @@
-- File: parser.lua
-- Defines a module named parser
parser = {}

-- Substitute the {{ $name }} placeholders in the javascript source
function set_params(scripts, params_js)
    for param, value in pairs(params_js) do
        scripts = scripts.gsub(scripts, "{{(%s*)$" .. param .. "(%s*)}}", value)
    end
    --scripts = scripts.gsub('123456 aaaa 123456', "[\s\\\]*aaaa\\\[\\\\s\\\]*", 'bbbb')
    return scripts
end

-- Run the page-parsing javascript inside the rendered page
function parser.select(splash, scripts_js, params_js)
    local file = io.open("/etc/splash/lua_modules/jquery-3.7.1.min.js", "r")
    splash:runjs(file:read('*a'))
    file:close()
    file = assert(io.open("/etc/splash/lua_modules/"..scripts_js..".js", "r"))
    scripts = file:read('*a')
    scripts = set_params(scripts, params_js)
    local js = splash:jsfunc(scripts)
    file:close()
    return js()
end

return parser

32
splash/scripts/modules/ygcg.js

@@ -0,0 +1,32 @@
function () {
    title = '';
    url = '';
    updateTime = '';
    region = '';
    announcementType = '';
    results = {};
    lists = new Array();
    // Grab the head of the announcement list
    ul = $('#app > div > div.z_list_vue > div.ant-spin-nested-loading > div > div > div.z_content > div.z_detail_content > div:nth-child(5) > div.ant-spin-nested-loading > div > ul');
    // Take the first element of the list; if it exists, the wrapped object has length == 1
    li = ul.children('li').first()
    while (li.length == 1)
    {
        // A fresh object per row, so each pushed entry keeps its own values
        item = {}
        a = li.find('div.ant-list-item-meta > div > h4 > span > a');
        item.title = $(a.children()['2']).attr('title');
        item.url = a.attr('href');
        item.updateTime = $(li.children()[1]).text();
        item.region = '宁波阳光采购';
        item.announcementType = '{{$announcement_type}}'
        lists.push(item)
        // Move on to the next list element
        li = li.next()
    }
    results.count = lists.length
    results.lists = lists
    return results
}
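
Splash's /execute endpoint serialises the Lua result table returned by main.lua as JSON, so each page parsed by this script comes back as an object with 'count' and 'lists'. A hypothetical post-processing sketch of what splash/SYgcg.py receives:

import json
from splash.SYgcg import SYgcg

r = SYgcg().open()
pages = json.loads(r.text)          # e.g. {"1": {...}, "2": {...}}, one entry per page
for key, page in pages.items():
    for entry in page.get('lists', []):
        print(entry.get('updateTime'), entry.get('title'), entry.get('url'))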

1650
splash/scripts/modules/zepto.js

File diff suppressed because it is too large

1650
splash/scripts/modules/zepto.js.1

File diff suppressed because it is too large