commit 1fb57e6877
35 changed files with 4828 additions and 0 deletions
@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml
@@ -0,0 +1,8 @@
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.9 (PyGuoyan)" />
  </component>
  <component name="ProjectRootManager">
    <output url="file://$PROJECT_DIR$/out" />
  </component>
</project>
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/PyGuoyan.iml" filepath="$PROJECT_DIR$/PyGuoyan.iml" />
    </modules>
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="PersistentConfig">
    <option name="langCode" value="en" />
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager" inherit-compiler-output="true">
    <exclude-output />
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.9 (PyGuoyan)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
@@ -0,0 +1,14 @@
#!/usr/bin/python3
from splash.gysplash import SBase
import json


class SYgcg(SBase):
    def open(self):
        return super().open('ygcg', pages=2, annoucement_type='政府采购')


if __name__ == '__main__':
    test = SYgcg()
    r = test.open()

    results = json.loads(r.text)
    print(results)
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,21 @@
-----BEGIN CERTIFICATE-----
MIIDdTCCAl2gAwIBAgILBAAAAAABFUtaw5QwDQYJKoZIhvcNAQEFBQAwVzELMAkG
A1UEBhMCQkUxGTAXBgNVBAoTEEdsb2JhbFNpZ24gbnYtc2ExEDAOBgNVBAsTB1Jv
b3QgQ0ExGzAZBgNVBAMTEkdsb2JhbFNpZ24gUm9vdCBDQTAeFw05ODA5MDExMjAw
MDBaFw0yODAxMjgxMjAwMDBaMFcxCzAJBgNVBAYTAkJFMRkwFwYDVQQKExBHbG9i
YWxTaWduIG52LXNhMRAwDgYDVQQLEwdSb290IENBMRswGQYDVQQDExJHbG9iYWxT
aWduIFJvb3QgQ0EwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDaDuaZ
jc6j40+Kfvvxi4Mla+pIH/EqsLmVEQS98GPR4mdmzxzdzxtIK+6NiY6arymAZavp
xy0Sy6scTHAHoT0KMM0VjU/43dSMUBUc71DuxC73/OlS8pF94G3VNTCOXkNz8kHp
1Wrjsok6Vjk4bwY8iGlbKk3Fp1S4bInMm/k8yuX9ifUSPJJ4ltbcdG6TRGHRjcdG
snUOhugZitVtbNV4FpWi6cgKOOvyJBNPc1STE4U6G7weNLWLBYy5d4ux2x8gkasJ
U26Qzns3dLlwR5EiUWMWea6xrkEmCMgZK9FGqkjWZCrXgzT/LCrBbBlDSgeF59N8
9iFo7+ryUp9/k5DPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNVHRMBAf8E
BTADAQH/MB0GA1UdDgQWBBRge2YaRQ2XyolQL30EzTSo//z9SzANBgkqhkiG9w0B
AQUFAAOCAQEA1nPnfE920I2/7LqivjTFKDK1fPxsnCwrvQmeU79rXqoRSLblCKOz
yj1hTdNGCbM+w6DjY1Ub8rrvrTnhQ7k4o+YviiY776BQVvnGCv04zcQLcFGUl5gE
38NflNUVyRRBnMRddWQVDf9VMOyGj/8N7yy5Y0b2qvzfvGn9LhJIZJrglfCm7ymP
AbEVtQwdpf5pLGkkeB6zpxxxYu7KyJesF12KwvhHhm4qxFYxldBniYUr+WymXUad
DKqC5JlR3XC321Y9YeRq4VzW9v493kHMB65jUr9TU/Qr6cf9tveCX4XSQRjbgbME
HMUfpIBvFSDJ3gyICh3WZlXi/EjJKSZp4A==
-----END CERTIFICATE-----
@@ -0,0 +1,5 @@
[database]
host = localhost
database = guoyantest
user = root
password = Guoyan83086775
@@ -0,0 +1,714 @@
#!/usr/bin/python3
"""
===========================================================================================
This module crawls procurement information.
It processes procurement announcements and mainly touches four tables:
sc_cggg, catalog, catalogdata and readlog.
===========================================================================================
class Crawler:
    def __init__(self, connect):
    def generate_id(self):
    def write_log_information(self, data_id, catalog_name):
    def CrawlPage_gzw_ningbo(self, page):          # bidding information of Ningbo SASAC municipal SOEs
    def CrawlPage_zjcs_nbxzfw(self, type, page):   # 宁波市中介超市
    def CrawlPage_ygcg_nbcqjy_org(self, page):     # 宁波市阳光采购
    def CrawlPage_zfcg_czt_zj(self, page):         # 浙江政府采购网
    def CrawlPage_cbbidding(self, page):           # 宁波中基国际招标有限公司
    def CrawlPage_zmeetb(self, page):              # 浙江国际招标有限公司
    def CrawlPage_nbbidding(self, page):           # 宁波国际招标有限公司
============================================================================================
"""

import datetime
import hashlib
import pymysql
import json
import random
from requests_html import HTMLSession
from requests_html import HTML, UserAgent
import gymailer
import time

'''
============================================================
This class wraps the Splash rendering service.
Here:
    self.splash_ip is the IP address of the Splash service.
============================================================
'''


class Splash:
    def __init__(self):
        self.splash_ip = '127.0.0.1'

    '''
    ============================================================
    The wait_for parameter specifies the element that must finish rendering
    before the call returns; otherwise the call waits up to 200 seconds.
    wait_for takes a CSS selector: use "#app" to target an element id and
    '.class-name' to target an element class.
    ============================================================
    '''
    def post(self, url, wait_for, pages=1, page_element='', headers={'content-type': 'application/json', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'}):
        lua_scripts = """
        function wait_for_element(splash, css, maxwait)
            -- Wait until a selector matches an element
            -- in the page. Return an error if waited more
            -- than maxwait seconds.
            if maxwait == nil then
                maxwait = 10
            end
            return splash:wait_for_resume(string.format([[
                function main(splash) {
                    var selector = '%s';
                    var maxwait = %s;
                    var end = Date.now() + maxwait*1000;

                    function check() {
                        if(document.querySelector(selector)) {
                            splash.resume('Element found');
                        } else if(Date.now() >= end) {
                            var err = 'Timeout waiting for element';
                            splash.error(err + " " + selector);
                        } else {
                            setTimeout(check, 200);
                        }
                    }
                    check();
                }
            ]], css, maxwait))
        end

        function main(splash, args)
            pages = """ + str(pages) + """
            page_element = '""" + page_element + """'
            wait_for = '""" + wait_for + """'
            splash:go('""" + url + """')
            wait_for_element(splash, wait_for)
            wait_for_element(splash, page_element)

            -- Add the first page to the result set
            results = {splash:html()}

            if pages == 1 then
                return results
            else
                -- Turn the pages:
                -- find the paging element on the page, then send a click() event
                for i = 2, pages do
                    -- js holds the JavaScript that locates the paging element and sends the click event
                    js = string.format("document.querySelector('%s').click();", page_element)

                    -- Run the paging script
                    splash:runjs(js)

                    -- Wait for the page to finish loading
                    wait_for_element(splash, wait_for)
                    wait_for_element(splash, page_element)

                    -- An explicit delay seems necessary here; without it the page
                    -- may not have finished updating before the HTML is captured
                    assert(splash:wait(5))

                    -- Add this page to the result set
                    table.insert(results, splash:html())
                end
                return results
            end
        end
        """

        splash_url = 'http://' + self.splash_ip + ':8050/execute'
        data = json.dumps({'lua_source': lua_scripts})
        r = HTMLSession().post(splash_url, headers=headers, data=data)
        return r

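'''
Usage sketch (an illustration added for clarity, not one of the crawler's own
call sites): assuming a Splash instance is listening on 127.0.0.1:8050, the
wrapper above can be driven like this; the URL and CSS selectors below are
hypothetical placeholders.

    r = Splash().post('https://example.com/notices', wait_for='.notice-list',
                      pages=2, page_element='.next-page')
    rendered = json.loads(r.text)   # maps "1", "2", ... to one rendered HTML document per page
'''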


class Crawler:
    def __init__(self, connect):
        self.connect = connect

    def generate_id(self):
        # Generate a 32-character hex ID
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + str(random.randint(0, 1000000))
        md5_hash = hashlib.md5()
        md5_hash.update(current_time.encode('utf-8'))
        return md5_hash.hexdigest()

    def write_log_information(self, data_id, catalog_name, log_type='采购公告'):
        # After a record is inserted, keep the related bookkeeping in sync:
        # one catalogdata row and one readlog row.
        # NOTE: catalog_name is currently unused; the catalog lookup uses log_type.
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id from catalog where name = '%s'" % (log_type))
            if affected_row == 0:
                return False

            result = cursor.fetchall()
            catalog_id = result[0][0]
            catalogdata_id = self.generate_id()
            readlog_id = self.generate_id()

            affected_row = cursor.execute("SELECT staffid FROM userinfo where username = 'root'")
            if affected_row == 0:
                return False

            result = cursor.fetchall()
            staff_id = result[0][0]
            add_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            affected_row = cursor.execute(
                'insert into catalogdata (id, dataid, catalogid, creatorid, menderid, adddate, modifydate, datastatus) values (%s, %s, %s, %s, %s, %s, %s, %s)',
                (catalogdata_id, data_id, catalog_id, staff_id, staff_id, add_date, add_date, 0))

            cursor.execute(
                'insert into readlog (id, dataid, staffid, readnum, adddate, LastAccessDate, resid) values (%s, %s, %s, %s, %s, %s, %s)',
                (readlog_id, data_id, staff_id, 1, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), catalog_id))

            return True

    def write_information(self, title, url, region, publishTime, announcementType):
        # Insert one announcement into the database
        with self.connect.cursor() as cursor:
            cggg_id = self.generate_id()

            try:
                # The query is parameterized, so pymysql takes care of quoting;
                # no manual escaping of quotes in the title is needed
                affected_rows = cursor.execute(
                    'insert into sc_cggg (id, bt, lj, ssqy, fbsj, gglb) values (%s, %s, %s, %s, %s, %s)',
                    (cggg_id, title, url, region, publishTime, announcementType))
            except pymysql.err.IntegrityError:
                print('Duplicate record')
                self.connect.rollback()
                return False
            else:
                if self.write_log_information(cggg_id, announcementType):
                    self.connect.commit()
                else:
                    print('Failed to add the procurement record')
                    self.connect.rollback()
                    return False

            return True

    def write_information_cgyx(self, cgyx):
        # Insert one procurement-intention record into the database
        with self.connect.cursor() as cursor:
            cgyx_id = self.generate_id()
            # Parameterized query instead of string concatenation, so quotes
            # in the values cannot break the SQL
            strSql = 'insert into sc_cgyx (id, cgxmmc, lj, cgxqqk, ysje, yjcgsj, ly) values (%s, %s, %s, %s, %s, %s, %s)'
            try:
                affected_rows = cursor.execute(strSql, (cgyx_id, cgyx['cgxmmc'], cgyx['lj'], cgyx['cgxqqk'], cgyx['ysje'], cgyx['yjcgsj'], cgyx['ly']))
            except pymysql.err.IntegrityError:
                print('Duplicate record')
                #self.connect.rollback()
                return False
            else:
                if self.write_log_information(cgyx_id, '采购意向'):
                    self.connect.commit()
                else:
                    print('Failed to add the procurement record')
                    self.connect.rollback()
                    return False

            return True

    def Check(self):
        with self.connect.cursor() as cursor:
            affected_row = cursor.execute("select id as total from sc_cggg where date(fbsj) > (NOW() - INTERVAL 1 DAY);")
            if affected_row == 0:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert', 'Procurement information extraction looks abnormal, please check!')
                return False
            else:
                return True


    def Crawl(self):
        # This method is the top-level entry point that performs the crawling.

        # Crawl 浙江政采网
        print('Fetching information from 浙江政采网\n')

        # Announcement-type parameters to pass in
        infoType = [
            {"announcementCode": "110-175885", "announcementType": "采购意向"},
            {"announcementCode": "110-978863", "announcementType": "采购公告"},
            {"announcementCode": "110-943756", "announcementType": "更正公告"},
            {"announcementCode": "110-420383", "announcementType": "非政府采购公告"},
            {"announcementCode": "110-900461", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 11):
                try:
                    self.CrawlPage_zfcg_czt_zj(page, typeParam)
                except Exception as e:
                    print('3--------------------------------', e)

        # Crawl 宁波市阳光采购网
        print('Fetching information from 宁波市阳光采购网\n')
        infoType = [
            {"announcementCode": "21", "announcementType": "采购公告"},
            {"announcementCode": "23", "announcementType": "更正公告"},
            {"announcementCode": "22", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            try:
                self.CrawlPage_ygcg_nbcqjy_org(2, typeParam)
            except Exception as e:
                print('4--------------------------------', e)

        # Crawl 宁波市中介超市网
        print('Fetching information from 宁波市中介超市网\n')
        infoType = [
            {"announcementCode": '1', "announcementType": "项目需求公告"},
            {"announcementCode": '2', "announcementType": "结果公告"}
        ]

        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_zjcs_nbxzfw(page, typeParam)
                except Exception as e:
                    print('5------------------------------', e)

        # Crawl procurement information of Ningbo SASAC municipal SOEs
        print('Fetching information from 宁波市国资委市属企业招投标网\n')
        for page in range(1, 5):
            try:
                self.CrawlPage_gzw_ningbo(page)
            except Exception as e:
                print('6------------------------------', e)

        # Crawl 宁波中基国际招标网
        print('Fetching information from 宁波中基国际招标网\n')
        infoType = [
            {"announcementCode": "22", "announcementType": "采购公告"},
            {"announcementCode": "23", "announcementType": "结果公告"}
        ]

        for typeParam in infoType:
            for page in range(1, 6):
                try:
                    self.CrawlPage_cbbidding(page, typeParam)
                except Exception as e:
                    print('7--------------------------------', e)

        # Crawl 浙江国际招标网
        print('Fetching information from 浙江国际招标网\n')
        infoType = [
            {"announcementCode": "Zbgg", "announcementType": "采购公告"},
            {"announcementCode": "Gzgg", "announcementType": "更正公告"},
            {"announcementCode": "jggg", "announcementType": "结果公告"}
        ]

        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_zmeetb(page, typeParam)
                except Exception as e:
                    print('8----------------------------', e)

        # Crawl the website of 宁波市国际招标有限公司
        print('Fetching information from 宁波国际招标网\n')

        # Announcement-type parameters to pass in
        infoType = [
            {"announcementCode": "1", "announcementType": "采购公告"},
            {"announcementCode": "1", "announcementType": "结果公告"},
            {"announcementCode": "2", "announcementType": "采购公告"},
            {"announcementCode": "2", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 5):
                try:
                    self.CrawlPage_nbbidding(page, typeParam)
                except Exception as e:
                    print('9--------------------------------', e)

        # Crawl the website of 宁波名诚招标代理有限公司
        print('Fetching information from 宁波名诚招标\n')

        # Announcement-type parameters to pass in
        infoType = [
            {"announcementCode": "99", "announcementType": "采购公告"},
            {"announcementCode": "88", "announcementType": "结果公告"}
        ]
        for typeParam in infoType:
            for page in range(1, 2):
                try:
                    self.CrawlPage_nbmcbidding(page, typeParam)
                except Exception as e:
                    print('10--------------------------------', e)


    # 宁波中基国际招标有限公司 https://www.cbbidding.com/
    def CrawlPage_cbbidding(self, page, typeParam):
        # Crawl one specific page.
        session = HTMLSession()
        session.DEFAULT_RETRIES = 5
        url = 'https://www.cbbidding.com/Index/cms.html?mid=' + typeParam['announcementCode'] + '&%2FIndex%2Fcms%2Fmid%2F' + typeParam['announcementCode'] + '_html=&page=' + str(page)

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the page has to be parsed
        r = session.get(url=url, headers=headers)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: 宁波中基国际招标网', r.text)
            return False

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[3]/div[3]/div[2]/div[2]/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = 'https://www.cbbidding.com' + item.xpath('//a')[0].attrs.get('href')
            region = '中基招标'
            publishDate = item.xpath('//div')[0].text

            try:
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))
            except Exception as e:
                publishDate = publishDate.replace('.', '-')
                publishDate = str(datetime.datetime.strptime(publishDate, '%Y-%m-%d'))

            print(url, title)
            announcementType = typeParam['announcementType']
            #print(title, url, region, publishDate, announcementType)
            self.write_information(title, url, region, publishDate, announcementType)


    # 浙江国际招投标有限公司 https://www.zmeetb.com/
    def CrawlPage_zmeetb(self, page, typeParam):
        # Crawl one specific page.
        session = HTMLSession()
        url = 'https://www.zmeetb.com/' + typeParam['announcementCode'] + '/index/p/' + str(page) + '.html'

        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate, br",
            "Cache-Control": "max-age=0",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "close",
            "DNT": '1',
            "Host": "www.zmeetb.com",
            "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "Windows",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the page has to be parsed.
        # Calling render() on this site runs into an SSL certificate problem;
        # the Chromium certificate handling needs further investigation.
        #r = session.get(url = url, headers = headers, verify='/opt/PyGuoyan/www.zmeetb.com')
        r = session.get(url=url, headers=headers, verify=False)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: 浙江国际招标网', r.text)
            return False

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[1]/div[3]/div[2]/div/div/div[3]/div/ul/li/a')
        for item in data:
            title = item.xpath('//p')[0].text
            url = item.attrs.get('href')
            region = '浙江国际招标'
            publishDate = item.xpath('//p')[1].text
            announcementType = typeParam['announcementType']

            self.write_information(title, url, region, publishDate, announcementType)


    # 宁波名诚招标代理有限公司 http://www.nbmcbidding.com/
    def CrawlPage_nbmcbidding(self, page, typeParam):
        # Crawl one specific page.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbmcbidding.com/news/99/" + str(page) + "/"
        else:
            url = "http://www.nbmcbidding.com/news/88/" + str(page) + "/"

        data = {}
        headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Host": "www.nbmcbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        r = session.get(url=url, headers=headers, json=data)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: 宁波名诚招标代理有限公司', r.text)
            return False

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[1]/div/div[3]/div[2]/ul/li')
        for item in data:
            title = item.xpath('//a/div[2]')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波名诚招标'
            publishDate = item.xpath('//a/div[4]')[0].text
            announcementType = typeParam['announcementType']

            self.write_information(title, url, region, publishDate, announcementType)


    # 宁波市国际招标有限公司 http://www.nbbidding.com/
    def CrawlPage_nbbidding(self, page, typeParam):
        # Crawl one specific page.
        session = HTMLSession()
        if typeParam['announcementType'] == '采购公告':
            url = "http://www.nbbidding.com/Home/Notice/news_list?page=" + str(page) + "&is_Open=1&keyword"
        else:
            url = "http://www.nbbidding.com/Home/Publicity/news_list?page=" + str(page) + "&is_Open=1&keyword"

        data = {}
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Host": "www.nbbidding.com",
            'Connection': 'keep-alive',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        r = session.get(url=url, headers=headers, json=data)

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: 宁波市国际招标网', r.text)
            return False

        data = json.loads(r.text)['data']
        total = data['page']['count']
        data = data['list']

        for item in data:
            id = item['id']
            if typeParam['announcementType'] == '采购公告':
                url = 'http://www.nbbidding.com/Home/Notice/news_detail?id=%s' % (id)
            else:
                url = 'http://www.nbbidding.com/Home/Publicity/news_detail?id=%s' % (id)
            title = item['title']
            region = '宁波国际招标'
            publishDate = item['addtime']
            announcementType = item['stage']
            self.write_information(title, url, region, publishDate, announcementType)

            print(publishDate, title, url)


    # 宁波市国资委市属企业招标信息网
    def CrawlPage_gzw_ningbo(self, page):
        # Crawl one specific page.
        session = HTMLSession()
        url = 'http://gzw.ningbo.gov.cn/col/col1229663137/index.html?uid=6085425&pageNum=%s' % str(page)

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        # This site returns an HTML page, so the page has to be parsed
        r = session.get(url=url, headers=headers)
        r.html.render()

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: 宁波市国资委市属企业招标信息网', r.text)
            return False

        # Note: xpath() returns a list whose items are Element objects
        data = r.html.xpath('/html/body/div[2]/div[3]/div/div/div[2]/div/div/div/ul/li')
        for item in data:
            title = item.xpath('//a')[0].text
            url = item.xpath('//a')[0].attrs.get('href')
            region = '宁波市属国企'
            publishDate = item.xpath('//p')[0].text
            announcementType = '采购公告'
            self.write_information(title, url, region, publishDate, announcementType)



    # 宁波市中介超市网
    def CrawlPage_zjcs_nbxzfw(self, page, typeParam):
        # Crawl one specific page.
        # typeParam identifies the announcement type.
        session = HTMLSession()
        urllist = ['http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0901&Type=' + typeParam['announcementCode'] + '&pageIndex=' + str(page) + '&pageSize=15',
                   'http://zjcs.nbxzfw.gov.cn/newsweb/api/News/GetList?ClassId=0902&Type=' + typeParam['announcementCode'] + '&pageIndex=' + str(page) + '&pageSize=15']

        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "DNT": '1',
            "Host": "ygcg.nbcqjy.org",
            "Referer": "http://zjcs.nbxzfw.gov.cn/newsweb/page/news/infolist.html?Type=" + typeParam['announcementCode'],
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.3.306"
        }

        for url in urllist:
            r = session.get(url=url, headers=headers)

            if r.status_code != 200:
                if page == 1:
                    gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: 宁波中介超市网', r.text)
                return False

            data = json.loads(r.text)['data']

            total = data['total']
            data = data['rows']

            for item in data:
                articleId = item['AutoId']
                BulletinTypeId = item['BulletinTypeId']
                url = 'http://zjcs.nbxzfw.gov.cn/YWGG/Info?id=%s&Type=%s' % (articleId, BulletinTypeId)
                title = item['BulletinTitle']
                region = '宁波中介超市'
                publishDate = item['PublishDate']
                announcementType = typeParam['announcementType']
                self.write_information(title, url, region, publishDate, announcementType)

                #print(publishDate, url)


    # 宁波阳光采购网
    def CrawlPage_ygcg_nbcqjy_org(self, pages, typeParam):
        # This method does the actual crawling of the requested pages.
        url = 'https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType=' + typeParam['announcementCode']

        wait_for = '.ant-pagination-item-ellipsis'
        page_element = '.anticon-right'
        try:
            r = Splash().post(url, wait_for, pages=pages, page_element=page_element)
        except Exception as e:
            print(e)
            return False

        results = json.loads(r.text)

        if r.status_code != 200:
            if pages == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: 宁波阳光采购网, status code: ' + str(r.status_code), r.text)
            return False

        for i in range(1, pages + 1):
            data = HTML(html=results[str(i)]).xpath('/html/body/div/div/div[2]/div[2]/div/div/div[2]/div[2]/div[5]/div[1]/div/ul/li')
            if len(data) == 0:
                print('Empty result set')
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: 宁波阳光采购网, empty page', 'The crawled page yielded no list items')
                return False

            for item in data:
                url = 'http://ygcg.nbcqjy.org' + item.xpath('//a')[0].attrs.get('href')
                title = item.xpath('//a/span[3]')[0].text
                region = '宁波阳光采购'
                publishDate = item.xpath('//div[2]')[0].text
                announcementType = typeParam['announcementType']
                print(title)
                self.write_information(title, url, region, publishDate, announcementType)


    # 浙江政府采购网
    def CrawlPage_zfcg_czt_zj(self, page, typeParam):
        # Crawl one specific page.
        session = HTMLSession()
        url = 'https://zfcg.czt.zj.gov.cn/portal/category'
        if typeParam['announcementCode'] == '110-420383':
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "districtCode": ["339900"],
                "isProvince": True,
                "includeGovDistrict": "1",
                "_t": 1699104836000
            }
        else:
            data = {
                "pageNo": page,
                "pageSize": 15,
                "categoryCode": typeParam['announcementCode'],
                "isGov": True,
                "excludeDistrictPrefix": "90",
                "_t": 1699104836000
            }

        headers = {
            "accept": "application/json, text/plain, */*",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "content-type": "application/json;charset=UTF-8",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"99\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "x-requested-with": "XMLHttpRequest"
        }

        try:
            r = session.post(url=url, headers=headers, json=data)
        except Exception as e:
            print('10-------------------------', e)
            return False

        if r.status_code != 200:
            if page == 1:
                gymailer.SendMail('jinqian_chen@126.com', 'jinqian.chen@srit.com.cn', 'Crawler alert: 浙江政府采购网', r.text)
            return False

        data = json.loads(r.text)['result']['data']
        total = data['total']
        data = data['data']

        for item in data:
            publishDate = datetime.datetime.fromtimestamp(item['publishDate'] / 1000)
            pageUrl = 'https://zfcg.czt.zj.gov.cn/luban/detail?parentId=600007&articleId=' + item['articleId'] + '&utm=luban.luban-PC-37000.979-pc-websitegroup-zhejiang-secondPage-front.21.320086307d6811ee86314be74945ec2c'
            detailUrl = 'https://zfcg.czt.zj.gov.cn/portal/detail?articleId=' + item['articleId']
            announcementType = typeParam['announcementType']
            if announcementType == '采购意向':
                r = session.get(url=detailUrl, headers=headers)

                detailData = json.loads(r.text)['result']['data']
                if detailData == None:
                    break

                content = HTML(html='<xml>' + detailData['content'] + '</xml>')
                region = item['districtName']
                for detailItem in content.xpath('xml/div/div/div[1]/div/table/tbody/tr'):
                    title = detailItem.xpath('//td[2]')[0].text
                    cgxqqk = detailItem.xpath('//td[3]')[0].text
                    ysje = detailItem.xpath('//td[4]')[0].text
                    yjcgsj = detailItem.xpath('//td[5]')[0].text
                    ly = detailData["title"]

                    self.write_information(title, pageUrl, region, publishDate, announcementType)
                    self.write_information_cgyx({'cgxmmc': title, 'lj': pageUrl, 'cgxqqk': cgxqqk, 'ysje': ysje, 'yjcgsj': yjcgsj, 'ly': ly})
            else:
                title = item['title']
                region = item['districtName']
                self.write_information(title, pageUrl, region, publishDate, announcementType)

            #print(publishDate, url)

        return True
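For reference, a minimal sketch of driving the class above (mirroring what main.py further down does; the host, credentials and database name are placeholders, with the real values read from jdbc.properties):

    import pymysql
    from crawler import Crawler

    # Placeholder credentials; production code loads them via Properties
    connect = pymysql.connect(host='localhost', user='root',
                              password='...', database='guoyantest')
    crawler = Crawler(connect)
    crawler.Crawl()   # crawl every configured site once
    connect.close()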
@@ -0,0 +1,137 @@
#!/usr/bin/python3

import pymysql
from properties import Properties
import sys, getopt

class DbSearch:
    # Provides search services over database metadata and contents
    def __init__(self, connect):
        self.connect = connect

    def GetTableList(self, database):
        # List the tables of a given schema
        cursorTable = self.connect.cursor()
        cursorTable.execute("SELECT table_name FROM INFORMATION_SCHEMA.TABLES where table_schema = '" + database + "'")

        return cursorTable.fetchall()

    def GetColumnList(self, tableName):
        # List the columns of a given table
        # NOTE: relies on the module-level `database` variable set in __main__
        cursorColumn = self.connect.cursor()
        cursorColumn.execute("SELECT column_name,data_type FROM INFORMATION_SCHEMA.COLUMNS where table_schema='" + database + "' AND table_name='" +
                             tableName + "'")
        return cursorColumn.fetchall()

    def SearchTableByColumnName(self, columnName, database):
        # Find tables that contain a column whose name matches columnName
        tableList = self.GetTableList(database)
        findList = list()
        for table in tableList:
            columnList = self.GetColumnList(table[0])
            for column in columnList:
                if column[0].find(columnName) != -1:
                    findList.append(table[0])

        return findList

    def SearchTableByText(self, searchText, database):
        # Find tables containing the string searchText and print the matching rows
        tableList = self.GetTableList(database)
        if len(tableList) == 0:
            return False

        found = 0
        findList = list()
        for table in tableList:
            strSql = "SELECT '" + table[0] + "' as table_name, t.* "
            strSql = strSql + " FROM " + database + "." + table[0] + " as t where " + "("

            columnList = self.GetColumnList(table[0])
            i = 0

            count = len(columnList)

            for column in columnList:
                # Skip columns whose data type is not textual
                if not column[1] in ('varchar', 'char', 'text'):
                    continue
                i += 1

                if i > 1:
                    strSql += " or "
                strSql += column[0] + " like '%" + searchText + "%' "

            strSql += ")"

            cursorColumn = self.connect.cursor()
            try:
                cursorColumn.execute(strSql)
            except Exception as e:
                print('2----------------------------', database, strSql)
                print("-----Error:-----\n", e)
                return False

            result = cursorColumn.fetchall()
            if len(result) > 0:
                findList.append(table[0])
                print("==========================================================================")
                print(table[0], result, strSql)
        return findList

if __name__ == '__main__':
    print(
        """
        ============================================================
        |This is a database full-text search tool with two options |
        ============================================================
        """)

    # Set the runtime environment. Set is_test to True in a test environment.
    is_test = False

    if is_test:
        file_path = "/opt/eresource_test/webapp/WEB-INF/classes/prod/jdbc.properties"
        database = 'guoyantest'
    else:
        file_path = "/opt/eresource/webapp/WEB-INF/classes/prod/jdbc.properties"
        database = 'guoyan'

    # Read the database settings from jdbc.properties
    props = Properties(file_path)
    host = 'localhost'
    user = props.get('jdbc.username')
    password = props.get('jdbc.password')

    # Open the database connection
    db = pymysql.connect(host=host, user=user, password=password, database=database)

    # Parse the command-line arguments
    keyword = ''
    searchType = ''
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hT:k:", ["keyword=", "searchType="])
    except getopt.GetoptError:
        print(sys.argv[0] + ' -k <keyword> -T <searchType>')
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print('3--------------------', 'test.py -k <keyword> -T <searchType>')
            sys.exit()
        elif opt in ("-k", "--keyword"):
            keyword = arg
        elif opt in ("-T", "--searchType"):
            searchType = arg
    dbSearch = DbSearch(db)
    if searchType == '0':
        print('Searching tables by the keyword you entered .....................')
        print('found tables: ', dbSearch.SearchTableByText(keyword, database))
    elif searchType == '1':
        print('Searching tables by the column name you entered .....................')
        print('found tables: ', dbSearch.SearchTableByColumnName(keyword, database))
@@ -0,0 +1,59 @@
#!/usr/bin/python3
"""Main program of the crawler
Author: 陈进钱
Date: 2023/11/03
"""

import pymysql
import datetime
import time
from apscheduler.schedulers.blocking import BlockingScheduler
from properties import Properties
from crawler import Crawler

print(
    """Procurement information collector v1.0
    ===================================================================================
    This program fetches procurement information from the major bidding websites
    version: 1.0
    Author: 陈进钱
    Date: 2023-11-04
    ===================================================================================""")

# Set the runtime environment. Set is_test to True in a test environment.
is_test = False

if is_test:
    file_path = "/opt/eresource_test/webapp/WEB-INF/classes/prod/jdbc.properties"
    database = 'guoyantest'
else:
    file_path = "/opt/eresource/webapp/WEB-INF/classes/prod/jdbc.properties"
    database = 'guoyan'

# Read the database settings from jdbc.properties
props = Properties(file_path)
host = 'localhost'
user = props.get('jdbc.username')
password = props.get('jdbc.password')

# Open the database connection
connect = pymysql.connect(host=host, user=user, password=password, database=database)

# Fetch procurement information and write it into the database
crawler = Crawler(connect)

# Daily health-check task
def crawl_check_func():
    crawler.Check()

# Periodic crawl task
def crawl_job_func():
    crawler.Crawl()

sched = BlockingScheduler()
sched.add_job(crawl_job_func, 'interval', hours=3, jitter=120, max_instances=4)
sched.add_job(crawl_check_func, 'interval', days=1, jitter=120, max_instances=4)
sched.start()

# Close the database connection (not reached while the blocking scheduler runs)
connect.close()
@@ -0,0 +1,44 @@
import smtplib
from email.mime.text import MIMEText
from email.header import Header

def SendMail(sender, receiver, subject, message):
    # Outgoing mail server
    smtp_server = 'smtp.126.com'

    # Outgoing mail server port
    smtp_port = 465

    # Build the mail object
    msg = MIMEText(message, 'plain', 'utf-8')
    msg['From'] = Header(sender, 'utf-8')
    msg['To'] = Header(receiver, 'utf-8')
    msg['Subject'] = Header(subject, 'utf-8')

    # SMTP object
    smtpObj = smtplib.SMTP_SSL(smtp_server, smtp_port)

    # Log in to the SMTP server
    smtpObj.login(sender, 'ERXYFJRLKPTTDXWH')

    # Send the mail
    smtpObj.sendmail(from_addr=sender, to_addrs=[receiver], msg=msg.as_string())

    # Close the SMTP connection
    smtpObj.quit()

if __name__ == '__main__':

    # Sender address
    sender = 'jinqian_chen@126.com'

    # Receiver address
    receiver = 'jinqian.chen@srit.com.cn'

    # Subject
    subject = 'Python3 mail example, new'

    # Body
    message = 'This is a mail sent from Python3'
    SendMail(sender, receiver, subject, message)
@@ -0,0 +1,46 @@
#Db2
#hibernate.dialect=org.hibernate.dialect.DB2Dialect
#jdbc.driverClassName=com.ibm.db2.jcc.DB2Driver
#jdbc.url=jdbc:db2://localhost:50000/eaching

#Oracle
#hibernate.dialect=org.hibernate.dialect.Oracle10gDialect
#jdbc.driverClassName=oracle.jdbc.driver.OracleDriver
#jdbc.url=jdbc:oracle:thin:@47.99.208.214:1521:orcl
#jdbc.url=jdbc:oracle:thin:@118.190.161.36:1521:orcl

#SqlServer
#hibernate.dialect=org.hibernate.dialect.SQLServerDialect
#jdbc.driverClassName=net.sourceforge.jtds.jdbc.Driver
#jdbc.url=jdbc:jtds:sqlserver://localhost:1433/guanwaimatou;SelectMethod=Cursor


#MySql
hibernate.dialect=org.hibernate.dialect.MySQLDialect
jdbc.driverClassName=com.mysql.jdbc.Driver
jdbc.url=jdbc:mysql://116.62.210.190:3306/guoyantest?autoReconnect=true&useUnicode=true&characterEncoding=UTF8&mysqlEncoding=utf8&zeroDateTimeBehavior=convertToNull

jdbc.username=root
jdbc.password=Guoyan83086775

jdbc.maxConn=20
jdbc.minConn=5
jdbc.activeTime=900000
jdbc.alias=eaching
jdbc.keepingSleepTime=30000
jdbc.maxConnectionLifetime=60000

jdbc.multiSource=false

hibernate.cache.use_second_level_cache=true
hibernate.show_sql=false
hibernate.generate_statistics=false
hibernate.cache.provider_class=org.hibernate.cache.EhCacheProvider
#hibernate.cache.provider_class=net.oschina.j2cache.hibernate3.J2CacheProvider
hibernate.cache.use_minimal_puts=true
hibernate.cache.use_structured_entries=true
hibernate.cache.use_query_cache=true
hibernate.use_sql_comments=true
hibernate.order_updates=true
hibernate.format_sql=false
hbm2ddl.auto=create
@@ -0,0 +1,75 @@
#!/usr/bin/python3
"""Main test program of the crawler
Author: 陈进钱
Date: 2023/11/03
"""

import pymysql
import datetime
import time
from apscheduler.schedulers.blocking import BlockingScheduler
from properties import Properties
from crawler import Crawler
import sys
import os

print(
    """Procurement information collector v1.0
    ===================================================================================
    This program fetches procurement information from the major bidding websites
    version: 1.0
    Author: 陈进钱
    Date: 2023-11-04
    ===================================================================================""")

# Set the runtime environment. Set is_test to True in a test environment.
is_test = True
if is_test:
    root = "/opt/eresource_test/webapp/WEB-INF/classes/prod/"
else:
    root = "/opt/eresource/webapp/WEB-INF/classes/prod/"

if os.path.exists(root):
    file_path = root + "jdbc.properties"
else:
    file_path = "jdbc.properties"

if sys.platform == 'win32':
    host = '116.62.210.190'
    user = 'root'
    password = 'Guoyan83086775'
    if is_test:
        database = 'guoyantest'
    else:
        database = 'guoyan'
else:
    if is_test:
        database = 'guoyantest'
    else:
        database = 'guoyan'

# Read the database settings from jdbc.properties
props = Properties(file_path)
host = '116.62.210.190'
user = props.get('jdbc.username')
password = props.get('jdbc.password')

# Open the database connection
connect = pymysql.connect(host=host, user=user, password=password, database=database)

# Fetch procurement information and write it into the database
crawler = Crawler(connect)
crawler.Crawl()
#crawler.CrawlPage_ygcg_nbcqjy_org(1, {"announcementCode": "21", "announcementType":"采购公告"})
#print(crawler.Check())

# Start the scheduled crawl task
#def crawl_job_func():
#    crawler.Crawl()

#sched = BlockingScheduler()
#sched.add_job(crawl_job_func, 'interval', hours=1, jitter=120)
#sched.start()

# Close the database connection
connect.close()
Binary file not shown.
@@ -0,0 +1,72 @@
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import re
import os
import tempfile


class Properties:

    def __init__(self, file_name):
        # Fall back to a local file when the configured one does not exist
        if not os.path.exists(file_name):
            file_name = 'jdbc.properties'

        self.file_name = file_name
        self.properties = {}
        try:
            fopen = open(self.file_name, 'r')
            for line in fopen:
                line = line.strip()
                if line.find('=') > 0 and not line.startswith('#'):
                    # Split on the first '=' only: property values (e.g. jdbc.url) may contain '='
                    strs = line.split('=', 1)
                    self.properties[strs[0].strip()] = strs[1].strip()
        except Exception as e:
            raise e
        else:
            fopen.close()

    def has_key(self, key):
        return key in self.properties

    def get(self, key, default_value=''):
        if key in self.properties:
            return self.properties[key]
        return default_value

    def put(self, key, value):
        self.properties[key] = value
        replace_property(self.file_name, key + '=.*', key + '=' + value, True)


def replace_property(file_name, from_regex, to_str, append_on_not_exists=True):
    tmpfile = tempfile.TemporaryFile()

    if os.path.exists(file_name):
        r_open = open(file_name, 'r')
        pattern = re.compile(r'' + from_regex)
        found = None
        for line in r_open:
            if pattern.search(line) and not line.strip().startswith('#'):
                found = True
                line = re.sub(from_regex, to_str, line)
            tmpfile.write(line.encode())
        if not found and append_on_not_exists:
            tmpfile.write(('\n' + to_str).encode())
        r_open.close()
        tmpfile.seek(0)

        content = tmpfile.read()

        if os.path.exists(file_name):
            os.remove(file_name)

        w_open = open(file_name, 'wb')
        w_open.write(content)
        w_open.close()

        tmpfile.close()
    else:
        print("file %s not found" % file_name)
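A minimal usage sketch of the class above (the file name is a placeholder; the keys mirror the jdbc.properties shipped in this commit):

    from properties import Properties

    props = Properties('jdbc.properties')       # falls back to the local file
    user = props.get('jdbc.username')
    password = props.get('jdbc.password', '')   # default when the key is missing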
@@ -0,0 +1,14 @@
#!/usr/bin/python3
from splash.gysplash import SBase
import json


class SYgcg(SBase):
    def open(self):
        return super().open('ygcg', pages=2, annoucement_type='政府采购')


if __name__ == '__main__':
    test = SYgcg()
    r = test.open()

    results = json.loads(r.text)
    print(results)
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,7 @@
#This is splash config file.
[splash service settings]
server = localhost
port = 8050



@@ -0,0 +1,20 @@
{
    "description": "This is splash config file.",
    "server": "127.0.0.1",
    "port": "8050",
    "class": {
        "SYgcg": {
            "url": "https://ygcg.nbcqjy.org/list?type=2&class=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&noticeType={{ $noticeType }}",
            "_comment": "http://www.baidu.com",
            "param": {
                "noticeType": "21"
            },
            "wait_for": ".ant-list-items",
            "page_element": ".anticon-right",
            "headers": {
                "content-type": "application/json",
                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0"
            }
        }
    }
}
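The `{{ $noticeType }}` placeholder above is filled in by SBase.replace() (defined in gysplash.py further down). A minimal illustration of that substitution, with hypothetical values:

    import re

    def replace(source, param, value):
        # Same pattern as SBase.replace: {{ $param }} with optional spaces
        return re.sub(r'{{[\s]*\$' + param + r'[\s]*}}', value, source)

    url = 'https://ygcg.nbcqjy.org/list?type=2&noticeType={{ $noticeType }}'
    print(replace(url, 'noticeType', '21'))
    # -> https://ygcg.nbcqjy.org/list?type=2&noticeType=21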
@@ -0,0 +1 @@
This is lua script file
@@ -0,0 +1,122 @@
#!/usr/bin/python3
'''===================================================================
This module wraps the Splash service for convenient use from Python.
Version: 1.0
Author: 陈进钱
Date: 2023-12-18
==================================================================='''
import os
import re
import json
import codecs
import configparser
from requests_html import HTMLSession
from requests_html import HTML

config = configparser.ConfigParser()

# Splash base class
class SBase:
    def __init__(self):
        self.__lua_script = ''
        self.config = {}

        # Locate the module directory
        self.root = os.path.dirname(os.path.abspath(__file__))

        # Create the config file automatically
        dir = self.root + '/config'
        if not os.path.exists(dir):
            os.makedirs(dir)

        file_path = self.root + '/config/splash.json'

        if os.path.exists(file_path):
            file = codecs.open(file_path, 'r', 'utf-8')
            content = file.read()
            self.config = json.loads(content)
            file.close()
        else:
            self.config['description'] = 'This is splash config file.'
            self.config['server'] = 'localhost'
            self.config['port'] = '8050'

            content = json.dumps(self.config)
            with codecs.open(file_path, 'w', 'utf-8') as file:
                file.write(content)

        # Create an empty script file automatically
        dir = self.root + '/scripts'
        if not os.path.exists(dir):
            os.makedirs(dir)

        # TODO: this should be replaced with generic code
        file_path = dir + '/main.lua'

        if os.path.exists(file_path):
            file = codecs.open(file_path, 'r', 'utf-8')
            self.__lua_script = file.read()
            file.close()
        else:
            with codecs.open(file_path, 'w', 'utf-8') as file:
                self.__lua_script = 'This is lua script file'
                file.write(self.__lua_script)

    def script(self):
        return self.__lua_script

    def class_name(self):
        return type(self).__name__

    def replace(self, source, param, value):
        return re.sub(r'{{[\s]*\$' + param + r'[\s]*}}', value, source)

    # Pass parameter values into the Lua script
    def set_params_for_lua(self, scripts, params):
        for param in params:
            scripts = self.replace(scripts, param, params[param])
        return scripts

    '''
    --------------------------------------------------------------------------------------
    Open the given URL. The URL, parameters, page element to wait for, paging element to
    wait for, headers and so on default to empty; whenever one of them is empty, it is
    taken from the entry for this class name in the config file.
    This function runs the main Lua script, which in turn executes the page-parsing
    JavaScript file whose name is passed in through the scripts_js parameter.
    --------------------------------------------------------------------------------------
    '''
    def open(self, scripts_js, pages=1, url='', params=None,
             wait_for='', page_element='', headers='', annoucement_type=''):
        if url == '':
            url = self.config['class'][self.class_name()]['url']
        if params == None:
            params = self.config['class'][self.class_name()]['param']
        if len(params) > 0:
            for param in params:
                url = self.replace(url, param, params[param])

        if wait_for == '':
            wait_for = self.config['class'][self.class_name()]['wait_for']

        if page_element == '':
            page_element = self.config['class'][self.class_name()]['page_element']

        if headers == '':
            headers = self.config['class'][self.class_name()]['headers']
        scripts = self.script()
        scripts = self.set_params_for_lua(scripts, {
            'pages': str(pages),
            'url': url,
            'wait_for': wait_for,
            'page_element': page_element,
            # The parser script name is passed through as a parameter
            'scripts_js': scripts_js,
            'announcement_type': annoucement_type
        })

        # print(scripts)
        data = json.dumps({'lua_source': scripts})
        splash_url = 'http://' + self.config['server'] + ':' + self.config['port'] + '/execute'
        r = HTMLSession().post(splash_url, headers=headers, data=data)

        return r
@@ -0,0 +1,75 @@
-- This file is the main entry point for page crawling
-- The parser has to be pulled in as a module here; dynamically loading the
-- js file does not seem to work otherwise
parser = require('parser')

function main(splash, args)
    pages = {{$pages}}
    scripts_js = '{{$scripts_js}}'
    page_element = '{{$page_element}}'
    wait_for = '{{$wait_for}}'
    announcement_type = '{{$announcement_type}}'
    splash:go('{{$url}}')
    wait_for_element(splash, wait_for)
    wait_for_element(splash, page_element)

    -- Set the JavaScript parameters
    results = {}
    params_js = {}
    params_js['announcement_type'] = announcement_type

    -- Add the first page to the result set
    result = parser.select(splash, scripts_js, params_js)
    table.insert(results, result)

    if pages == 1 then
        return results
    else
        -- Turn the pages:
        -- find the paging element on the page, then send a click() event
        for i = 2, pages do
            -- Run the paging script
            -- js holds the JavaScript that locates the paging element and sends the click event
            js = string.format("document.querySelector('%s').click();", page_element)
            splash:runjs(js)

            -- Wait for the page to finish loading
            wait_for_element(splash, wait_for)
            wait_for_element(splash, page_element)

            -- An explicit delay seems necessary here; without it the page
            -- may not have finished updating before it is parsed
            assert(splash:wait(5))
            result = parser.select(splash, scripts_js, params_js)
            table.insert(results, result)
        end
        return results
    end
end

function wait_for_element(splash, css, maxwait)
    -- Wait until a selector matches an element
    -- in the page. Return an error if waited more
    -- than maxwait seconds.
    if maxwait == nil then
        maxwait = 10
    end
    return splash:wait_for_resume(string.format([[
        function main(splash) {
            var selector = '%s';
            var maxwait = %s;
            var end = Date.now() + maxwait*1000;

            function check() {
                if(document.querySelector(selector)) {
                    splash.resume('Element found');
                } else if(Date.now() >= end) {
                    var err = 'Timeout waiting for element';
                    splash.error(err + " " + selector);
                } else {
                    setTimeout(check, 200);
                }
            }
            check();
        }
    ]], css, maxwait))
end

File diff suppressed because one or more lines are too long
@@ -0,0 +1,28 @@
-- Defines a module named parser (loaded via require('parser'))
parser = {}

function set_params(scripts, params_js)
    for param, value in pairs(params_js) do
        scripts = scripts:gsub("{{(%s*)$" .. param .. "(%s*)}}", value)
    end
    --scripts = scripts.gsub('123456 aaaa 123456', "[\s\\\]*aaaa\\\[\\\\s\\\]*", 'bbbb')
    return scripts
end

-- Run the named JavaScript parser against the current page
function parser.select(splash, scripts_js, params_js)
    local file = io.open("/etc/splash/lua_modules/jquery-3.7.1.min.js", "r")
    splash:runjs(file:read('*a'))
    file:close()

    file = assert(io.open("/etc/splash/lua_modules/"..scripts_js..".js", "r"))
    scripts = file:read('*a')
    scripts = set_params(scripts, params_js)
    local js = splash:jsfunc(scripts)
    file:close()

    return js()
end

return parser
@@ -0,0 +1,32 @@
function () {
    title = '';
    url = '';
    updateTime = '';
    region = '';
    announcementType = '';
    results = {};
    lists = new Array();

    // Grab the head of the list
    ul = $('#app > div > div.z_list_vue > div.ant-spin-nested-loading > div > div > div.z_content > div.z_detail_content > div:nth-child(5) > div.ant-spin-nested-loading > div > ul');
    // Fetch the first element of the list; on success the wrapped object has length = 1
    li = ul.children('li').first()
    item = {}
    while (li.length == 1)
    {
        a = li.find('div.ant-list-item-meta > div > h4 > span > a');
        item.title = $(a.children()['2']).attr('title');
        item.url = a.attr('href');
        item.updateTime = $(li.children()[1]).text();
        item.region = '宁波阳光采购';
        item.announcementType = '{{$announcement_type}}'

        lists.push(item)
        // Move on to the next list element
        li = li.next()
    }

    results.count = lists.length
    results.lists = lists
    return results
}
File diff suppressed because it is too large
File diff suppressed because it is too large