commit
1fb57e6877
35 changed files with 4828 additions and 0 deletions
@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml
@ -0,0 +1,8 @@ |
|||||
|
<project version="4"> |
||||
|
<component name="Black"> |
||||
|
<option name="sdkName" value="Python 3.9 (PyGuoyan)" /> |
||||
|
</component> |
||||
|
<component name="ProjectRootManager"> |
||||
|
<output url="file://$PROJECT_DIR$/out" /> |
||||
|
</component> |
||||
|
</project> |
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/PyGuoyan.iml" filepath="$PROJECT_DIR$/PyGuoyan.iml" />
    </modules>
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="PersistentConfig">
    <option name="langCode" value="en" />
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.9 (PyGuoyan)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
@@ -0,0 +1,14 @@
#!/usr/bin/python3
from splash.gysplash import SBase
import json


class SYgcg(SBase):
    def open(self):
        return super().open('ygcg', pages=2, announcement_type='政府采购')


if __name__ == '__main__':
    test = SYgcg()
    r = test.open()

    results = json.loads(r.text)
    print(results)
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,21 @@
-----BEGIN CERTIFICATE-----
MIIDdTCCAl2gAwIBAgILBAAAAAABFUtaw5QwDQYJKoZIhvcNAQEFBQAwVzELMAkG
A1UEBhMCQkUxGTAXBgNVBAoTEEdsb2JhbFNpZ24gbnYtc2ExEDAOBgNVBAsTB1Jv
b3QgQ0ExGzAZBgNVBAMTEkdsb2JhbFNpZ24gUm9vdCBDQTAeFw05ODA5MDExMjAw
MDBaFw0yODAxMjgxMjAwMDBaMFcxCzAJBgNVBAYTAkJFMRkwFwYDVQQKExBHbG9i
YWxTaWduIG52LXNhMRAwDgYDVQQLEwdSb290IENBMRswGQYDVQQDExJHbG9iYWxT
aWduIFJvb3QgQ0EwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDaDuaZ
jc6j40+Kfvvxi4Mla+pIH/EqsLmVEQS98GPR4mdmzxzdzxtIK+6NiY6arymAZavp
xy0Sy6scTHAHoT0KMM0VjU/43dSMUBUc71DuxC73/OlS8pF94G3VNTCOXkNz8kHp
1Wrjsok6Vjk4bwY8iGlbKk3Fp1S4bInMm/k8yuX9ifUSPJJ4ltbcdG6TRGHRjcdG
snUOhugZitVtbNV4FpWi6cgKOOvyJBNPc1STE4U6G7weNLWLBYy5d4ux2x8gkasJ
U26Qzns3dLlwR5EiUWMWea6xrkEmCMgZK9FGqkjWZCrXgzT/LCrBbBlDSgeF59N8
9iFo7+ryUp9/k5DPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNVHRMBAf8E
BTADAQH/MB0GA1UdDgQWBBRge2YaRQ2XyolQL30EzTSo//z9SzANBgkqhkiG9w0B
AQUFAAOCAQEA1nPnfE920I2/7LqivjTFKDK1fPxsnCwrvQmeU79rXqoRSLblCKOz
yj1hTdNGCbM+w6DjY1Ub8rrvrTnhQ7k4o+YviiY776BQVvnGCv04zcQLcFGUl5gE
38NflNUVyRRBnMRddWQVDf9VMOyGj/8N7yy5Y0b2qvzfvGn9LhJIZJrglfCm7ymP
AbEVtQwdpf5pLGkkeB6zpxxxYu7KyJesF12KwvhHhm4qxFYxldBniYUr+WymXUad
DKqC5JlR3XC321Y9YeRq4VzW9v493kHMB65jUr9TU/Qr6cf9tveCX4XSQRjbgbME
HMUfpIBvFSDJ3gyICh3WZlXi/EjJKSZp4A==
-----END CERTIFICATE-----
@@ -0,0 +1,5 @@
[database]
host = localhost
database = guoyantest
user = root
password = Guoyan83086775
@@ -0,0 +1,714 @@
#!/usr/bin/python3
"""
===========================================================================================
This module crawls procurement information.
It processes procurement announcements, touching four tables:
sc_cggg, catalog, catalogdata and readlog.
===========================================================================================
class Crawler:
    def __init__(self, connect):
    def generate_id(self):
    def write_log_information(self, data_id, catalog_name):
    def CrawlPage_gzw_ningbo(self, page):                   # Ningbo SASAC municipal SOE tendering information
    def CrawlPage_zjcs_nbxzfw(self, page, typeParam):       # Ningbo intermediary supermarket
    def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):  # Ningbo sunshine procurement
    def CrawlPage_zfcg_czt_zj(self, page, typeParam):       # Zhejiang government procurement network
    def CrawlPage_cbbidding(self, page, typeParam):         # Ningbo Zhongji International Tendering Co., Ltd.
    def CrawlPage_zmeetb(self, page, typeParam):            # Zhejiang International Tendering Co., Ltd.
    def CrawlPage_nbbidding(self, page, typeParam):         # Ningbo International Tendering Co., Ltd.
    def CrawlPage_nbmcbidding(self, page, typeParam):       # Ningbo Mingcheng Tendering Agency Co., Ltd.
============================================================================================
"""

import datetime
import hashlib
import pymysql
import json
import random
from requests_html import HTMLSession
from requests_html import HTML, UserAgent
import gymailer
import time

'''
============================================================
This class wraps the Splash rendering service.
Here:
    self.splash_ip is the IP address of the Splash service.
============================================================
'''


class Splash:
    def __init__(self):
        self.splash_ip = '127.0.0.1'

    '''
    ============================================================
    The wait_for parameter names the element to wait for: the call only
    returns once that element has rendered, otherwise it waits up to 200
    seconds. wait_for takes a CSS selector, e.g. '#app' for an element id
    or '.class-name' for a class. (A usage sketch follows the method body.)
    ============================================================
    '''
    def post(self, url, wait_for, pages=1, page_element='', headers={'content-type':'application/json','Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'}):
        lua_scripts = """
        function wait_for_element(splash, css, maxwait)
            -- Wait until a selector matches an element
            -- in the page. Return an error if waited more
            -- than maxwait seconds.
            if maxwait == nil then
                maxwait = 10
            end
            return splash:wait_for_resume(string.format([[
                function main(splash) {
                    var selector = '%s';
                    var maxwait = %s;
                    var end = Date.now() + maxwait*1000;

                    function check() {
                        if(document.querySelector(selector)) {
                            splash.resume('Element found');
                        } else if(Date.now() >= end) {
                            var err = 'Timeout waiting for element';
                            splash.error(err + " " + selector);
                        } else {
                            setTimeout(check, 200);
                        }
                    }
                    check();
                }
            ]], css, maxwait))
        end

        function main(splash, args)
            pages = """ + str(pages) + """
            page_element = '""" + page_element + """'
            wait_for = '""" + wait_for + """'
            splash:go('""" + url + """')
            wait_for_element(splash, wait_for)
            wait_for_element(splash, page_element)

            -- Add the first page to the result set
            results = {splash:html()}

            if pages == 1 then
                return results
            else
                -- Turn the pages:
                -- locate the pager element on the page, then send it a click() event
                for i = 2, pages do
                    -- js holds the JavaScript that fetches the pager element and sends the click event
                    js = string.format("document.querySelector('%s').click();", page_element)

                    -- Run the paging script
                    splash:runjs(js)

                    -- Wait for the page to finish loading
                    wait_for_element(splash, wait_for)
                    wait_for_element(splash, page_element)

                    -- A delay seems to be required here, otherwise the page
                    -- may not have finished re-rendering yet
                    assert(splash:wait(5))

                    -- Add this page to the result set
                    table.insert(results, splash:html())
                end
                return results
            end
        end
        """

        splash_url = 'http://' + self.splash_ip + ':8050/execute'
        data = json.dumps({'lua_source':lua_scripts})
        r = HTMLSession().post(splash_url, headers=headers, data=data)
        return r
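
# Usage sketch (illustrative, not part of the original module): fetch two pages
# of a hypothetical listing through the local Splash service. The URL and
# selectors below are assumptions made for the sake of the example.
#
#   r = Splash().post('https://example.com/list', wait_for='.ant-list-items',
#                     pages=2, page_element='.anticon-right')
#   pages_html = json.loads(r.text)   # one rendered HTML document per page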


class Crawler:
    def __init__(self, connect):
        self.connect = connect

    def generate_id(self):
        # Generate a 32-character ID (md5 hex digest of the current time plus a random number)
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + str(random.randint(0, 1000000))
        md5_hash = hashlib.md5()
        md5_hash.update(current_time.encode('utf-8'))
        return md5_hash.hexdigest()

    def write_log_information(self, data_id, catalog_name, log_type='采购公告'):
        # A record has been added, so the related bookkeeping must be updated too:
        # the catalog-data entry and the read-log entry
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id from catalog where name = '%s'" % (log_type))
            if affected_row == 0:
                return False

            result = cursor.fetchall()
            catalog_id = result[0][0]
            catalogdata_id = self.generate_id()
            readlog_id = self.generate_id()

            affected_row = cursor.execute("SELECT staffid FROM userinfo where username = 'root'")
            if affected_row == 0:
                return False

            result = cursor.fetchall()
            staff_id = result[0][0]
            add_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            affected_row = cursor.execute(
                'insert into catalogdata (id, dataid, catalogid, creatorid, menderid, adddate, modifydate, datastatus) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                (catalogdata_id, data_id, catalog_id, staff_id, staff_id, add_date, add_date, 0))

            cursor.execute(
                'insert into readlog (id, dataid, staffid, readnum, adddate, LastAccessDate, resid) values (%s, %s, %s, %s, %s, %s, %s)',
                (readlog_id, data_id, staff_id, 1, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), catalog_id))

            return True

    def write_information(self, title, url, region, publishTime, announcementType):
        # Write one announcement record into the database
        with self.connect.cursor() as cursor:
            cggg_id = self.generate_id()

            try:
                title = title.replace("'", "\\\'")
                affected_rows = cursor.execute(
                    'insert into sc_cggg (id, bt, lj, ssqy, fbsj, gglb) values (%s, %s, %s, %s, %s, %s)',
                    (cggg_id, title, url, region, publishTime, announcementType))
            except pymysql.err.IntegrityError:
                print('信息重复')
                self.connect.rollback()
                return False
            else:
                if self.write_log_information(cggg_id, announcementType):
                    self.connect.commit()
                else:
                    print('添加采购信息失败')
                    self.connect.rollback()
                    return False

            return True

    def write_information_cgyx(self, cgyx):
        # Write one procurement-intention record into the database

        with self.connect.cursor() as cursor:
            cgyx_id = self.generate_id()
            cgyx['cgxmmc'] = cgyx['cgxmmc'].replace("'", "\\\'")
            strSql = 'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) values (\''+cgyx_id+'\',\''+cgyx['cgxmmc']+'\',\''+cgyx['lj']+'\',\''+cgyx['cgxqqk']+'\',\''+cgyx['ysje']+'\',\''+cgyx['yjcgsj']+'\',\''+cgyx['ly']+'\')'
            try:
                affected_rows = cursor.execute(strSql)
            except pymysql.err.IntegrityError:
                print('信息重复')
                #self.connect.rollback()
                return False
            else:
                if self.write_log_information(cgyx_id, '采购意向'):
                    self.connect.commit()
                else:
                    print('添加采购信息失败')
                    self.connect.rollback()
                    return False

            return True
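
    # Sketch (not original code): the hand-concatenated strSql above could use the
    # same parameterized style write_information() already uses, which also removes
    # the manual quote escaping:
    #
    #   cursor.execute(
    #       'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) '
    #       'values (%s, %s, %s, %s, %s, %s, %s)',
    #       (cgyx_id, cgyx['cgxmmc'], cgyx['lj'], cgyx['cgxqqk'],
    #        cgyx['ysje'], cgyx['yjcgsj'], cgyx['ly']))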

    def Check(self):
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id as total from sc_cggg where date(fbsj) > (NOW() - INTERVAL 1 DAY);")
            if affected_row == 0:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息', '采购信息提取不正常,请检查!')
                return False
            else:
                return True

    def Crawl(self):
        # This method is the top-level entry point that actually runs the crawl.

        # Crawl the Zhejiang government procurement network
        print('开始获取浙江政采网的信息\n')

        # Announcement-type descriptors passed to the page crawler
        infoType = [
            {"announcementCode": "110-175885", "announcementType":"采购意向"},
            {"announcementCode": "110-978863", "announcementType":"采购公告"},
            {"announcementCode": "110-943756", "announcementType":"更正公告"},
            {"announcementCode": "110-420383", "announcementType":"非政府采购公告"},
            {"announcementCode": "110-900461", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 11):
                try:
                    self.CrawlPage_zfcg_czt_zj(page, typeParam)
                except Exception as e:
                    print('3--------------------------------', e)

        # Crawl the Ningbo sunshine procurement network
        print('开始获取宁波市阳光采购网的信息\n')
        infoType = [
            {"announcementCode": "21", "announcementType":"采购公告"},
            {"announcementCode": "23", "announcementType":"更正公告"},
            {"announcementCode": "22", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            try:
                self.CrawlPage_ygcg_nbcqjy_org(2, typeParam)
            except Exception as e:
                print('4--------------------------------', e)

        # Crawl the Ningbo intermediary supermarket network
        print('开始获取宁波市中介超市网的信息\n')
        infoType = [
            {"announcementCode": '1', "announcementType":"项目需求公告"},
            {"announcementCode": '2', "announcementType":"结果公告"}
        ]

        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_zjcs_nbxzfw(page, typeParam)
                except Exception as e:
                    print('5------------------------------', e)

        # Crawl procurement information of Ningbo SASAC municipal SOEs
        print('开始获取宁波市国资委市属企业招投标网的信息\n')
        for page in range(1, 5):
            try:
                self.CrawlPage_gzw_ningbo(page)
            except Exception as e:
                print('6------------------------------', e)

        # Crawl the Ningbo Zhongji international tendering network
        print('开始获取宁波中基国际招标网的信息\n')
        infoType = [
            {"announcementCode": "22", "announcementType":"采购公告"},
            {"announcementCode": "23", "announcementType":"结果公告"}
        ]

        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_cbbidding(page, typeParam)
                except Exception as e:
                    print('7--------------------------------', e)

        # Crawl the Zhejiang international tendering network
        print('开始获取浙江国际招标网的信息\n')
        infoType = [
            {"announcementCode": "Zbgg", "announcementType":"采购公告"},
            {"announcementCode": "Gzgg", "announcementType":"更正公告"},
            {"announcementCode": "jggg", "announcementType":"结果公告"}
        ]

        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_zmeetb(page, typeParam)
                except Exception as e:
                    print('8----------------------------', e)

        # Crawl the website of Ningbo International Tendering Co., Ltd.
        print('开始获取宁波国际招标网的信息\n')

        # Announcement-type descriptors passed to the page crawler
        infoType = [
            {"announcementCode": "1", "announcementType":"采购公告"},
            {"announcementCode": "1", "announcementType":"结果公告"},
            {"announcementCode": "2", "announcementType":"采购公告"},
            {"announcementCode": "2", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_nbbidding(page, typeParam)
                except Exception as e:
                    print('9--------------------------------', e)

        # Crawl the website of Ningbo Mingcheng Tendering Agency Co., Ltd.
        print('开始获取宁波名城招标的信息\n')

        # Announcement-type descriptors passed to the page crawler
        infoType = [
            {"announcementCode": "99", "announcementType":"采购公告"},
            {"announcementCode": "88", "announcementType":"结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 2):
                try:
                    self.CrawlPage_nbmcbidding(page, typeParam)
                except Exception as e:
                    print('10--------------------------------', e)

    # Ningbo Zhongji International Tendering Co., Ltd. https://www.cbbidding.com/
    def CrawlPage_cbbidding(self, page, typeParam):
        # Crawl one specified listing page.
        session = HTMLSession()
        session.DEFAULT_RETRIES = 5
        url = 'https://www.cbbidding.com/Index/cms.html?mid=' + typeParam['announcementCode'] + '&%2FIndex%2Fcms%2Fmid%2F' + typeParam['announcementCode'] + '_html=&page=' + str(page)

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the page has to be parsed
        r = session.get(url = url, headers = headers)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中基国际招标网', r.text)
            return False

        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[3]/div[3]/div[2]/div[2]/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = 'https://www.cbbidding.com' + item.xpath('//a')[0].attrs.get('href')
            region = '中基招标'
            publishDate = item.xpath('//div')[0].text

            try:
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
            except Exception as e:
                # Normalize dates like '2023.11.03' to '2023-11-03' before parsing
                publishDate = publishDate.replace('.', '-')
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))

            print(url, title)
            announcementType = typeParam['announcementType']
            #print(title, url, region, publishDate, announcementType)
            self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang International Tendering Co., Ltd. https://www.zmeetb.com/
    def CrawlPage_zmeetb(self, page, typeParam):
        # Crawl one specified listing page.
        session = HTMLSession()
        url = 'https://www.zmeetb.com/' + typeParam['announcementCode'] + '/index/p/' + str(page) + '.html'

        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Cache-Control": "max-age=0",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
            "DNT": '1',
            "Host": "www.zmeetb.com",
            "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "Windows",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the page has to be parsed.
        # Calling render() on this site runs into an SSL-certificate problem;
        # the Chromium certificate handling still needs further investigation.
        #r = session.get(url = url, headers = headers, verify='/opt/PyGuoyan/www.zmeetb.com')
        r = session.get(url = url, headers = headers, verify=False)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:浙江国际招标网', r.text)
            return False

        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[1]/div[3]/div[2]/div/div/div[3]/div/ul/li/a')
        for item in data:
            title = item.xpath('//p')[0].text
            url = item.attrs.get('href')
            region = '浙江国际招标'
            publishDate = item.xpath('//p')[1].text
            announcementType = typeParam['announcementType']

            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo Mingcheng Tendering Agency Co., Ltd. http://www.nbmcbidding.com/
    def CrawlPage_nbmcbidding(self, page, typeParam):
        # Crawl one specified listing page.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbmcbidding.com/news/99/" + str(page) + "/"
        else:
            url = "http://www.nbmcbidding.com/news/88/" + str(page) + "/"

        data = {}
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Host": "www.nbmcbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        r = session.get(url = url, headers = headers, json = data)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波名诚招标代理有限公司', r.text)
            return False

        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[1]/div/div[3]/div[2]/ul/li')
        for item in data:
            title = item.xpath('//a/div[2]')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波名诚招标'
            publishDate = item.xpath('//a/div[4]')[0].text
            announcementType = typeParam['announcementType']

            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo International Tendering Co., Ltd. http://www.nbbidding.com/
    def CrawlPage_nbbidding(self, page, typeParam):
        # Crawl one specified listing page.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbbidding.com/Home/Notice/news_list?page=" + str(page) + "&is_Open=1&keyword"
        else:
            url = "http://www.nbbidding.com/Home/Publicity/news_list?page=" + str(page) + "&is_Open=1&keyword"

        data = {}
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Host": "www.nbbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        r = session.get(url = url, headers = headers, json = data)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国际招标网', r.text)
            return False

        data = json.loads(r.text)['data']
        total = data['page']['count']
        data = data['list']

        for item in data:
            id = item['id']
            if typeParam['announcementType'] == '采购公告':
                url = 'http://www.nbbidding.com/Home/Notice/news_detail?id=%s' % (id)
            else:
                url = 'http://www.nbbidding.com/Home/Publicity/news_detail?id=%s' % (id)
            title = item['title']
            region = '宁波国际招标'
            publishDate = item['addtime']
            announcementType = item['stage']
            self.write_information(title, url, region, publishDate, announcementType)

            print(publishDate, title, url)
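
    # Expected response shape, reconstructed from the field accesses above (illustrative):
    #   {"data": {"page": {"count": ...},
    #             "list": [{"id": ..., "title": ..., "addtime": ..., "stage": ...}, ...]}}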

    # Ningbo SASAC municipal SOE tendering information network
    def CrawlPage_gzw_ningbo(self, page):
        # Crawl one specified listing page.
        session = HTMLSession()
        url = 'http://gzw.ningbo.gov.cn/col/col1229663137/index.html?uid=6085425&pageNum=%s' % str(page)

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the page has to be parsed
        r = session.get(url = url, headers = headers)
        r.html.render()

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波市国资委市属企业招标信息网', r.text)
            return False

        # Note: xpath() returns a list whose items are elements
        data = r.html.xpath('/html/body/div[2]/div[3]/div/div/div[2]/div/div/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波市属国企'
            publishDate = item.xpath('//p')[0].text
            announcementType = '采购公告'
            self.write_information(title, url, region, publishDate, announcementType)

    # Ningbo intermediary supermarket network
    def CrawlPage_zjcs_nbxzfw(self, page, typeParam):
        # Crawl one specified listing page.
        # typeParam identifies the type of procurement information.
        session = HTMLSession()
        urllist = ['http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0901&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15','http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0902&Type='+typeParam['announcementCode']+'&pageIndex='+str(page)+'&pageSize=15']

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "Host": "ygcg.nbcqjy.org",
            "Referer": "http://zjcs.nbxzfw.gov.cn/newsweb/page/news/infolist.html?Type="+typeParam['announcementCode'],
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        for url in urllist:
            r = session.get(url = url, headers = headers)

            if r.status_code != 200:
                if page == 1:
                    gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波中介超市网', r.text)
                return False

            data = json.loads(r.text)['data']

            total = data['total']
            data = data['rows']

            for item in data:
                articleId = item['AutoId']
                BulletinTypeId = item['BulletinTypeId']
                url = 'http://zjcs.nbxzfw.gov.cn/YWGG/Info?id=%s&Type=%s' % (articleId, BulletinTypeId)
                title = item['BulletinTitle']
                region = '宁波中介超市'
                publishDate = item['PublishDate']
                announcementType = typeParam['announcementType']
                self.write_information(title, url, region, publishDate, announcementType)

                #print(publishDate, url)

    # Ningbo sunshine procurement network
    def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
        # Crawl the requested number of listing pages through Splash.
        url = 'https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType=' + typeParam['announcementCode']

        wait_for = '.ant-pagination-item-ellipsis'
        page_element = '.anticon-right'
        try:
            r = Splash().post(url, wait_for, pages=pages, page_element=page_element)
        except Exception as e:
            print(e)
            return False

        results = json.loads(r.text)

        if r.status_code != 200:
            gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, 错误代码:'+str(r.status_code), r.text)
            return False

        for i in range(1, pages + 1):
            data = HTML(html=results[str(i)]).xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[5]/div[1]/div/ul/li')
            if len(data) == 0:
                print('数据为空')
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波阳光采购网, keyerror', '第%s页数据为空' % str(i))
                return False

            for item in data:
                url = 'http://ygcg.nbcqjy.org' + item.xpath('//a')[0].attrs.get('href')
                title = item.xpath('//a/span[3]')[0].text
                region = '宁波阳光采购'
                publishDate = item.xpath('//div[2]')[0].text
                announcementType = typeParam['announcementType']
                print(title)
                self.write_information(title, url, region, publishDate, announcementType)

    # Zhejiang government procurement network
    def CrawlPage_zfcg_czt_zj(self, page, typeParam):
        # Crawl one specified listing page.
        session = HTMLSession()
        url = 'https://zfcg.czt.zj.gov.cn/portal/category'
        if typeParam['announcementCode'] == '110-420383':
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "districtCode": ["339900"],
                "isProvince": True,
                "includeGovDistrict": "1",
                "_t": 1699104836000
            }
        else:
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "isGov": True,
                "excludeDistrictPrefix": "90",
                "_t": 1699104836000
            }

        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "content-type": "application/json;charset=UTF-8",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "x-requested-with": "XMLHttpRequest"
        }

        try:
            r = session.post(url = url, headers = headers, json = data)
        except Exception as e:
            print('10-------------------------', e)
            return False

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', '爬虫警告信息:宁波政府采购网', r.text)
            return False

        data = json.loads(r.text)['result']['data']
        total = data['total']
        data = data['data']

        for item in data:
            publishDate = datetime.datetime.fromtimestamp(item['publishDate']/1000)
            pageUrl = 'https://zfcg.czt.zj.gov.cn/luban/detail?parentId=600007&articleId=' + item['articleId'] + '&utm=luban.luban-PC-37000.979-pc-websitegroup-zhejiang-secondPage-front.21.320086307d6811ee86314be74945ec2c'
            detailUrl = 'https://zfcg.czt.zj.gov.cn/portal/detail?articleId=' + item['articleId']
            announcementType = typeParam['announcementType']
            if announcementType == '采购意向':
                r = session.get(url = detailUrl, headers = headers)

                detailData = json.loads(r.text)['result']['data']
                if detailData is None:
                    break

                content = HTML(html='<xml>'+detailData['content']+'</xml>')
                region = item['districtName']
                for detailItem in content.xpath('xml/div/div/div[1]/div/table/tbody/tr'):
                    title = detailItem.xpath('//td[2]')[0].text
                    cgxqqk = detailItem.xpath('//td[3]')[0].text
                    ysje = detailItem.xpath('//td[4]')[0].text
                    yjcgsj = detailItem.xpath('//td[5]')[0].text
                    ly = detailData["title"]

                    self.write_information(title, pageUrl, region, publishDate, announcementType)
                    self.write_information_cgyx({'cgxmmc':title,'lj':pageUrl, 'cgxqqk':cgxqqk, 'ysje':ysje, 'yjcgsj':yjcgsj, 'ly':ly})
            else:
                title = item['title']
                region = item['districtName']
                self.write_information(title, pageUrl, region, publishDate, announcementType)

            #print(publishDate, url)

        return True
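
# Usage sketch (illustrative; main.py drives the class the same way):
#
#   connect = pymysql.connect(host='localhost', user='root',
#                             password='...', database='guoyan')
#   crawler = Crawler(connect)
#   crawler.Crawl()   # crawl every configured site once
#   crawler.Check()   # mail a warning if no rows arrived in the last day
#   connect.close()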
@@ -0,0 +1,137 @@
#!/usr/bin/python3

import pymysql
from properties import Properties
import sys, getopt


class DbSearch:
    # This class provides various database search services
    def __init__(self, connect):
        self.connect = connect

    def GetTableList(self, database):
        # List the tables of the given database
        cursorTable = self.connect.cursor()
        cursorTable.execute("SELECT table_name FROM INFORMATION_SCHEMA.TABLES where table_schema = '" + database + "'")

        return cursorTable.fetchall()

    def GetColumnList(self, tableName):
        # List the columns of the given table
        # (note: `database` here is the module-level global set in __main__)
        cursorColumn = self.connect.cursor()
        cursorColumn.execute("SELECT column_name,data_type FROM INFORMATION_SCHEMA.COLUMNS where table_schema='" + database + "' AND table_name='" +
            tableName + "'")
        return cursorColumn.fetchall()

    def SearchTableByColumnName(self, columnName, database):
        # Find the tables that have a column whose name contains columnName
        tableList = self.GetTableList(database)
        findList = list()
        for table in tableList:
            columnList = self.GetColumnList(table[0])
            for column in columnList:
                if column[0].find(columnName) != -1:
                    findList.append(table[0])

        return findList

    def SearchTableByText(self, searchText, database):
        # Find the tables that contain the string searchText and print the matching rows
        tableList = self.GetTableList(database)
        if len(tableList) == 0:
            return False

        found = 0
        findList = list()
        for table in tableList:
            strSql = "SELECT '" + table[0] + "' as table_name, t.* "
            strSql = strSql + " FROM " + database + "." + table[0] + " as t where " + "("

            columnList = self.GetColumnList(table[0])
            i = 0

            count = len(columnList)

            for column in columnList:
                # Skip columns whose data type is not textual
                if not column[1] in ('varchar', 'char', 'text'):
                    continue
                i += 1

                if i > 1:
                    strSql += " or "
                strSql += column[0] + " like '%" + searchText + "%' "

            strSql += ")"

            cursorColumn = self.connect.cursor()
            try:
                cursorColumn.execute(strSql)
            except Exception as e:
                print('2----------------------------', database, strSql)
                print("-----错误信息:-----\n", e)
                return False

            result = cursorColumn.fetchall()
            if len(result) > 0:
                findList.append(table[0])
                print("==========================================================================")
                print(table[0], result, strSql)
        return findList
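
    # Sketch (not original code) of the statement SearchTableByText() assembles
    # for a table sc_cggg with text columns bt and lj, searching for '空调':
    #
    #   SELECT 'sc_cggg' as table_name, t.*  FROM guoyan.sc_cggg as t where (bt like '%空调%'  or lj like '%空调%' )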


if __name__ == '__main__':
    print(
        """
        ============================================================
        |这是数据库全文检索工具,包含两个参数                      |
        ============================================================
        """)

    # Configure the runtime environment: set is_test to True on a test environment
    is_test = False

    if is_test:
        file_path = "/opt/eresource_test/webapp/WEB-INF/classes/prod/jdbc.properties"
        database = 'guoyantest'
    else:
        file_path = "/opt/eresource/webapp/WEB-INF/classes/prod/jdbc.properties"
        database = 'guoyan'

    # Read the database configuration from the jdbc.properties file
    props = Properties(file_path)
    host = 'localhost'
    user = props.get('jdbc.username')
    password = props.get('jdbc.password')

    # Open the database connection
    db = pymysql.connect(host = host, user = user, password = password, database = database)

    # Read the command-line arguments
    keyword = ''
    searchType = ''
    try:
        opts, args = getopt.getopt(sys.argv[1:],"hT:k:",["keyword=","searchType="])
    except getopt.GetoptError:
        print(sys.argv[0] + ' -k <keyword> -T <searchType>')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('3--------------------', 'test.py -k <keyword> -T <searchType>')
            sys.exit()
        elif opt in ("-k", "--keyword"):
            keyword = arg
        elif opt in ("-T", "--searchType"):
            searchType = arg
    dbSearch = DbSearch(db)
    if searchType == '0':
        print('正在根据您输入的关键词查找表.....................')
        print('found tables: ', dbSearch.SearchTableByText(keyword, database))
    elif searchType == '1':
        print('正在根据您输入的列名查找表.....................')
        print('found tables: ', dbSearch.SearchTableByColumnName(keyword, database))
@@ -0,0 +1,59 @@
#!/usr/bin/python3
"""Main program of the crawler

Author: 陈进钱
Date: 2023/11/03
"""

import pymysql
import datetime
import time
from apscheduler.schedulers.blocking import BlockingScheduler
from properties import Properties
from crawler import Crawler

print(
    """采购信息采集器 v1.0
    ===================================================================================
    这个程序用于获取各大招投标网站的采购信息
    version: 1.0
    作者:陈进钱
    日期:2023-11-04
    ===================================================================================""")

# Configure the runtime environment: set is_test to True on a test environment
is_test = False

if is_test:
    file_path = "/opt/eresource_test/webapp/WEB-INF/classes/prod/jdbc.properties"
    database = 'guoyantest'
else:
    file_path = "/opt/eresource/webapp/WEB-INF/classes/prod/jdbc.properties"
    database = 'guoyan'

# Read the database configuration from the jdbc.properties file
props = Properties(file_path)
host = 'localhost'
user = props.get('jdbc.username')
password = props.get('jdbc.password')

# Open the database connection
connect = pymysql.connect(host = host, user = user, password = password, database = database)

# Fetch the procurement information and write it into the database
crawler = Crawler(connect)

# Scheduled health-check task
def crawl_check_func():
    crawler.Check()

# Scheduled crawl task
def crawl_job_func():
    crawler.Crawl()

sched = BlockingScheduler()
sched.add_job(crawl_job_func, 'interval', hours=3, jitter=120, max_instances=4)
sched.add_job(crawl_check_func, 'interval', days=1, jitter=120, max_instances=4)
sched.start()

# Close the database connection (reached only after the blocking scheduler shuts down)
connect.close()
@@ -0,0 +1,44 @@
import smtplib
from email.mime.text import MIMEText
from email.header import Header

def SendMail(sender, receiver, subject, message):
    # Outgoing mail server
    smtp_server = 'smtp.126.com'

    # Outgoing mail server port
    smtp_port = 465

    # Mail object
    msg = MIMEText(message, 'plain', 'utf-8')
    msg['From'] = Header(sender, 'utf-8')
    msg['To'] = Header(receiver, 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')

    # SMTP object
    smtpObj = smtplib.SMTP_SSL(smtp_server, smtp_port)

    # Log in to the SMTP server
    smtpObj.login(sender, 'ERXYFJRLKPTTDXWH')

    # Send the mail
    smtpObj.sendmail(from_addr=sender,to_addrs=[receiver],msg=msg.as_string())

    # Close the SMTP connection
    smtpObj.quit()

if __name__ == '__main__':

    # Sender address
    sender = 'jinqian_chen@126.com'

    # Receiver address
    receiver = 'jinqian.chen@srit.com.cn'

    # Mail subject
    subject = 'Python3发送邮件示例, new'

    # Mail body
    message = '这是一封Python3发送的邮件'
    SendMail(sender, receiver, subject, message)
@@ -0,0 +1,46 @@
#Db2
#hibernate.dialect=org.hibernate.dialect.DB2Dialect
#jdbc.driverClassName=com.ibm.db2.jcc.DB2Driver
#jdbc.url=jdbc:db2://localhost:50000/eaching

#Oracle
#hibernate.dialect=org.hibernate.dialect.Oracle10gDialect
#jdbc.driverClassName=oracle.jdbc.driver.OracleDriver
#jdbc.url=jdbc:oracle:thin:@47.99.208.214:1521:orcl
#jdbc.url=jdbc:oracle:thin:@118.190.161.36:1521:orcl

#SqlServer
#hibernate.dialect=org.hibernate.dialect.SQLServerDialect
#jdbc.driverClassName=net.sourceforge.jtds.jdbc.Driver
#jdbc.url=jdbc:jtds:sqlserver://localhost:1433/guanwaimatou;SelectMethod=Cursor


#MySql
hibernate.dialect=org.hibernate.dialect.MySQLDialect
jdbc.driverClassName=com.mysql.jdbc.Driver
jdbc.url=jdbc:mysql://116.62.210.190:3306/guoyantest?autoReconnect=true&useUnicode=true&characterEncoding=UTF8&mysqlEncoding=utf8&zeroDateTimeBehavior=convertToNull

jdbc.username=root
jdbc.password=Guoyan83086775

jdbc.maxConn=20
jdbc.minConn=5
jdbc.activeTime=900000
jdbc.alias=eaching
jdbc.keepingSleepTime=30000
jdbc.maxConnectionLifetime=60000

jdbc.multiSource=false

hibernate.cache.use_second_level_cache=true
hibernate.show_sql=false
hibernate.generate_statistics=false
hibernate.cache.provider_class=org.hibernate.cache.EhCacheProvider
#hibernate.cache.provider_class=net.oschina.j2cache.hibernate3.J2CacheProvider
hibernate.cache.use_minimal_puts=true
hibernate.cache.use_structured_entries=true
hibernate.cache.use_query_cache=true
hibernate.use_sql_comments=true
hibernate.order_updates=true
hibernate.format_sql=false
hbm2ddl.auto=create
@@ -0,0 +1,75 @@
#!/usr/bin/python3
"""Main program of the crawler

Author: 陈进钱
Date: 2023/11/03
"""

import pymysql
import datetime
import time
from apscheduler.schedulers.blocking import BlockingScheduler
from properties import Properties
from crawler import Crawler
import sys
import os

print(
    """采购信息采集器 v1.0
    ===================================================================================
    这个程序用于获取各大招投标网站的采购信息
    version: 1.0
    作者:陈进钱
    日期:2023-11-04
    ===================================================================================""")

# Configure the runtime environment: set is_test to True on a test environment
is_test = True
if is_test:
    root = "/opt/eresource_test/webapp/WEB-INF/classes/prod/"
else:
    root = "/opt/eresource/webapp/WEB-INF/classes/prod/"

if os.path.exists(root):
    file_path = root + "jdbc.properties"
else:
    file_path = "jdbc.properties"

if sys.platform == 'win32':
    host = '116.62.210.190'
    user = 'root'
    password = 'Guoyan83086775'
    if is_test:
        database = 'guoyantest'
    else:
        database = 'guoyan'
else:
    if is_test:
        database = 'guoyantest'
    else:
        database = 'guoyan'

# Read the database configuration from the jdbc.properties file
# (this overrides the credentials set in the win32 branch above)
props = Properties(file_path)
host = '116.62.210.190'
user = props.get('jdbc.username')
password = props.get('jdbc.password')

# Open the database connection
connect = pymysql.connect(host = host, user = user, password = password, database = database)

# Fetch the procurement information and write it into the database
crawler = Crawler(connect)
crawler.Crawl()
#crawler.CrawlPage_ygcg_nbcqjy_org(1, {"announcementCode": "21", "announcementType":"采购公告"})
#print(crawler.Check())

# Start the scheduled crawl task
#def crawl_job_func():
#    crawler.Crawl()

#sched = BlockingScheduler()
#sched.add_job(crawl_job_func, 'interval', hours=1, jitter=120)
#sched.start()

# Close the database connection
connect.close()
Binary file not shown.
@@ -0,0 +1,72 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import re
import os
import tempfile


class Properties:

    def __init__(self, file_name):
        # Fall back to the local file if the configured file does not exist
        if not os.path.exists(file_name):
            file_name = 'jdbc.properties'

        self.file_name = file_name
        self.properties = {}
        try:
            fopen = open(self.file_name, 'r')
            for line in fopen:
                line = line.strip()
                if line.find('=') > 0 and not line.startswith('#'):
                    strs = line.split('=')
                    self.properties[strs[0].strip()] = strs[1].strip()
        except Exception as e:
            raise e
        else:
            fopen.close()

    def has_key(self, key):
        return key in self.properties

    def get(self, key, default_value=''):
        if key in self.properties:
            return self.properties[key]
        return default_value

    def put(self, key, value):
        self.properties[key] = value
        replace_property(self.file_name, key + '=.*', key + '=' + value, True)


def replace_property(file_name, from_regex, to_str, append_on_not_exists=True):
    tmpfile = tempfile.TemporaryFile()

    if os.path.exists(file_name):
        r_open = open(file_name, 'r')
        pattern = re.compile(r'' + from_regex)
        found = None
        for line in r_open:
            if pattern.search(line) and not line.strip().startswith('#'):
                found = True
                line = re.sub(from_regex, to_str, line)
            tmpfile.write(line.encode())
        if not found and append_on_not_exists:
            tmpfile.write(('\n' + to_str).encode())
        r_open.close()
        tmpfile.seek(0)

        content = tmpfile.read()

        if os.path.exists(file_name):
            os.remove(file_name)

        w_open = open(file_name, 'wb')
        w_open.write(content)
        w_open.close()

        tmpfile.close()
    else:
        print("file %s not found" % file_name)
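
# Usage sketch (illustrative, not part of the original module):
#
#   props = Properties('jdbc.properties')
#   user = props.get('jdbc.username', 'root')   # default returned if the key is absent
#   props.put('jdbc.maxConn', '30')             # also rewrites the file on disk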
@@ -0,0 +1,14 @@
#!/usr/bin/python3
from splash.gysplash import SBase
import json


class SYgcg(SBase):
    def open(self):
        return super().open('ygcg', pages=2, announcement_type='政府采购')


if __name__ == '__main__':
    test = SYgcg()
    r = test.open()

    results = json.loads(r.text)
    print(results)
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,7 @@
#This is splash config file.
[splash service settings]
server = localhost
port = 8050



@@ -0,0 +1,20 @@
{
    "description": "This is splash config file.",
    "server": "127.0.0.1",
    "port": "8050",
    "class":{
        "SYgcg":{
            "url":"https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType={{ $noticeType }}",
            "_comment":"http://www.baidu.com",
            "param":{
                "noticeType":"21"
            },
            "wait_for":".ant-list-items",
            "page_element":".anticon-right",
            "headers":{
                "content-type":"application/json",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0"
            }
        }
    }
}
@@ -0,0 +1 @@
This is lua script file
@@ -0,0 +1,122 @@
#!/usr/bin/python3
'''===================================================================
This module wraps the Splash service for convenient use from Python.
Version: 1.0
Author: 陈进钱
Date: 2023-12-18
==================================================================='''
import os
import re
import json
import codecs
import configparser
from requests_html import HTMLSession
from requests_html import HTML

# Legacy ConfigParser instance; the class below actually reads the JSON config
config = configparser.ConfigParser()

# Splash base class
class SBase:
    def __init__(self):
        self.__lua_script = ''
        self.config = {}

        # Directory this file lives in
        self.root = os.path.dirname(os.path.abspath(__file__))

        # Create the config file automatically if missing
        dir = self.root + '/config'
        if not os.path.exists(dir):
            os.makedirs(dir)

        file_path = self.root + '/config/splash.json'

        if os.path.exists(file_path):
            file = codecs.open(file_path, 'r', 'utf-8')
            content = file.read()
            self.config = json.loads(content)
            file.close()
        else:
            self.config['description'] = 'This is splash config file.'
            self.config['server'] = 'localhost'
            self.config['port'] = '8050'

            content = json.dumps(self.config)
            with codecs.open(file_path, 'w', 'utf-8') as file:
                file.write(content)

        # Create an empty script file automatically if missing
        dir = self.root + '/scripts'
        if not os.path.exists(dir):
            os.makedirs(dir)

        # TODO: generalize this into reusable code
        file_path = dir + '/main.lua'

        if os.path.exists(file_path):
            file = codecs.open(file_path, 'r', 'utf-8')
            self.__lua_script = file.read()
            file.close()
        else:
            with codecs.open(file_path, 'w', 'utf-8') as file:
                self.__lua_script = 'This is lua script file'
                file.write(self.__lua_script)

    def script(self):
        return self.__lua_script

    def class_name(self):
        return type(self).__name__

    def replace(self, source, param, value):
        return re.sub(r'{{[\s]*\$' + param + r'[\s]*}}', value, source)
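
    # Illustrative: with the url template from config/splash.json,
    #   self.replace('...&noticeType={{ $noticeType }}', 'noticeType', '21')
    # yields '...&noticeType=21'.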

    # Substitute parameter variables into the Lua script
    def set_params_for_lua(self, scripts, params):
        for param in params:
            scripts = self.replace(scripts, param, params[param])
        return scripts

    '''
    --------------------------------------------------------------------------------------
    This function opens the given URL. The URL, parameters, the page element to
    wait for, the pager element to wait for, the headers and so on default to
    empty; whenever one of them is empty, it is taken from the entry for this
    class name in the configuration file.
    The function runs the main Lua script, which in turn executes the
    page-parsing Lua script whose name is passed in via scripts_js.
    --------------------------------------------------------------------------------------
    '''
    def open(self, scripts_js, pages=1, url='', params=None,
             wait_for='', page_element='', headers='', announcement_type=''):
        if url == '':
            url = self.config['class'][self.class_name()]['url']
        if params == None:
            params = self.config['class'][self.class_name()]['param']
        if len(params) > 0:
            for param in params:
                url = self.replace(url, param, params[param])

        if wait_for == '':
            wait_for = self.config['class'][self.class_name()]['wait_for']

        if page_element == '':
            page_element = self.config['class'][self.class_name()]['page_element']

        if headers == '':
            headers = self.config['class'][self.class_name()]['headers']
        scripts = self.script()
        scripts = self.set_params_for_lua(scripts, {
            'pages':str(pages),
            'url':url,
            'wait_for':wait_for,
            'page_element':page_element,
            # The parser script name is passed through as a parameter
            'scripts_js': scripts_js,
            'announcement_type':announcement_type
        })

        # print(scripts)
        data = json.dumps({'lua_source':scripts})
        splash_url = 'http://' + self.config['server'] + ':' + self.config['port'] + '/execute'
        r = HTMLSession().post(splash_url, headers=headers, data=data)

        return r
@@ -0,0 +1,75 @@
-- Main entry point for page scraping.
-- The module-load approach is required here; otherwise the js file apparently cannot be loaded dynamically
parser = require('parser')

function main(splash, args)
    pages = {{$pages}}
    scripts_js = '{{$scripts_js}}'
    page_element = '{{$page_element}}'
    wait_for = '{{$wait_for}}'
    announcement_type = '{{$announcement_type}}'
    splash:go('{{$url}}')
    wait_for_element(splash, wait_for)
    wait_for_element(splash, page_element)

    -- Set up the JavaScript parameters
    results = {}
    params_js = {}
    params_js['announcement_type'] = announcement_type

    -- Add the first page to the result set
    result = parser.select(splash, scripts_js, params_js)
    table.insert(results, result)

    if pages == 1 then
        return results
    else
        -- Turn the pages:
        -- locate the pager element on the page, then send it a click() event
        for i = 2, pages do
            -- Run the paging script
            -- js holds the JavaScript that fetches the pager element and sends the click event
            js = string.format("document.querySelector('%s').click();", page_element)
            splash:runjs(js)

            -- Wait for the page to finish loading
            wait_for_element(splash, wait_for)
            wait_for_element(splash, page_element)

            -- A delay seems to be required here, otherwise the page
            -- may not have finished re-rendering yet
            assert(splash:wait(5))
            result = parser.select(splash, scripts_js, params_js)
            table.insert(results, result)
        end
        return results
    end
end

function wait_for_element(splash, css, maxwait)
    -- Wait until a selector matches an element
    -- in the page. Return an error if waited more
    -- than maxwait seconds.
    if maxwait == nil then
        maxwait = 10
    end
    return splash:wait_for_resume(string.format([[
        function main(splash) {
            var selector = '%s';
            var maxwait = %s;
            var end = Date.now() + maxwait*1000;

            function check() {
                if(document.querySelector(selector)) {
                    splash.resume('Element found');
                } else if(Date.now() >= end) {
                    var err = 'Timeout waiting for element';
                    splash.error(err + " " + selector);
                } else {
                    setTimeout(check, 200);
                }
            }
            check();
        }
    ]], css, maxwait))
end
File diff suppressed because one or more lines are too long
@@ -0,0 +1,28 @@
-- This file defines the page-parsing module (loaded via require as `parser`)
parser = {}

function set_params(scripts, params_js)
    for param, value in pairs(params_js) do
        scripts = scripts:gsub("{{(%s*)$" .. param .. "(%s*)}}", value)
    end
    --scripts = scripts.gsub('123456 aaaa 123456', "[\s\\\]*aaaa\\\[\\\\s\\\]*", 'bbbb')
    return scripts
end

-- Run the named parsing script against the current page
function parser.select(splash, scripts_js, params_js)
    local file = io.open("/etc/splash/lua_modules/jquery-3.7.1.min.js", "r")
    splash:runjs(file:read('*a'))
    file:close()

    file = assert(io.open("/etc/splash/lua_modules/"..scripts_js..".js", "r"))
    scripts = file:read('*a')
    scripts = set_params(scripts, params_js)
    local js = splash:jsfunc(scripts)
    file:close()

    return js()
end

return parser
@@ -0,0 +1,32 @@
function () {
    title = '';
    url = '';
    updateTime = '';
    region = '';
    announcementType = '';
    results = {};
    lists = new Array();

    // Grab the head of the list
    ul = $('#app > div > div.z_list_vue > div.ant-spin-nested-loading > div > div > div.z_content > div.z_detail_content > div:nth-child(5) > div.ant-spin-nested-loading > div > ul');
    // Fetch the first list element; on success the wrapped object has length == 1
    li = ul.children('li').first()
    while (li.length == 1)
    {
        item = {}   // a fresh object per row, so entries do not share state
        a = li.find('div.ant-list-item-meta > div > h4 > span > a');
        item.title = $(a.children()['2']).attr('title');
        item.url = a.attr('href');
        item.updateTime = $(li.children()[1]).text();
        item.region = '宁波阳光采购';
        item.announcementType = '{{$announcement_type}}'

        lists.push(item)
        // Move to the next list element
        li = li.next()
    }

    results.count = lists.length
    results.lists = lists
    return results
}
File diff suppressed because it is too large
File diff suppressed because it is too large