#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by Charles on 2018/10/10
# Function:
|
import sys
import requests
from bs4 import BeautifulSoup
# Maximum number of characters kept from a result abstract.
ABSTRACT_MAX_LENGTH = 300    # abstract max length

# Candidate User-Agent strings.
# NOTE(review): this pool is defined but never used — HEADERS below always
# sends the fixed Chrome UA. Kept for possible rotation; confirm before removing.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
    ' Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR) AppleWebKit/533.3 '
    '(KHTML, like Gecko) QtWeb Internet Browser/3.7 http://www.QtWeb.net',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/41.0.2228.0 Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, '
    'like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4pre) '
    'Gecko/20070404 K-Ninja/2.1.3',
    'Mozilla/5.0 (Future Star Technologies Corp.; Star-Blade OS; x86_64; U; '
    'en-US) iNet Browser 4.7',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) '
    'Gecko/20080414 Firefox/2.0.0.13 Pogo/2.0.0.13.6866'
]

# Request headers sent with every search request (fixed desktop-Chrome UA,
# Baidu referer so the request looks like an in-site navigation).
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Content-Type": "application/x-www-form-urlencoded",
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    "Referer": "https://www.baidu.com/",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9"
}

# Baidu host (used to absolutize the next-page href) and the search endpoint
# to which the url-encoded keyword is appended.
baidu_host_url = "https://www.baidu.com"
baidu_search_url = "https://www.baidu.com/s?ie=utf-8&tn=baidu&wd="

# Shared session: reuses the headers above and keeps cookies / connections
# across successive result-page fetches.
session = requests.Session()
session.headers = HEADERS
def search(keyword, num_results=10, debug=0):
    """
    Search Baidu for *keyword* and collect organic results.

    :param keyword: query string; a falsy keyword returns None immediately
    :param num_results: upper bound on the number of results returned
    :param debug: 1 to print per-page progress, 0 for silence
    :return: list of dicts with keys "title", "abstract", "url", "rank",
             or None when keyword is empty
    """
    if not keyword:
        return None

    list_result = []
    page = 1

    # First results page for this query.
    next_url = baidu_search_url + keyword

    # Walk result pages until we have enough entries or run out of pages;
    # parse_html returns (entries, next-page url or None).
    while len(list_result) < num_results:
        # BUGFIX: the debug flag was accepted but never forwarded, so
        # parse_html's exception logging could never fire.
        data, next_url = parse_html(next_url, rank_start=len(list_result), debug=debug)
        if data:
            list_result += data
            if debug:
                print("---searching[{}], finish parsing page {}, results number={}: ".format(keyword, page, len(data)))
                for d in data:
                    print(str(d))

        if not next_url:
            if debug:
                print(u"already search the last page。")
            break
        page += 1

    if debug:
        print("\n---search [{}] finished. total results number={}!".format(keyword, len(list_result)))
    # Trim any overshoot from the final page.
    return list_result[: num_results] if len(list_result) > num_results else list_result
def parse_html(url, rank_start=0, debug=0):
    """
    Fetch one Baidu results page and extract its result entries.

    :param url: results-page URL to fetch
    :param rank_start: rank offset of the first result on this page
    :param debug: 1 to print parsing exceptions, 0 for silence
    :return: tuple (list of result dicts, next-page URL or None);
             (None, None) when the page cannot be fetched or parsed
    """
    try:
        res = session.get(url=url)
        res.encoding = "utf-8"
        root = BeautifulSoup(res.text, "lxml")

        list_data = []
        div_contents = root.find("div", id="content_left")
        for div in div_contents.contents:
            # Skip non-Tag children (NavigableStrings) of the results container.
            if type(div) != type(div_contents):
                continue

            class_list = div.get("class", [])
            if not class_list:
                continue

            # Only "c-container" divs are search-result entries.
            if "c-container" not in class_list:
                continue

            # NOTE(review): this deliberately shadows the `url` parameter;
            # the parameter is only needed for session.get() above.
            title = ''
            url = ''
            abstract = ''
            try:
                # For every hit, pull out title, link and abstract text;
                # layout differs per result template, hence the branches.
                if "xpath-log" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        title = div.text.strip().split("\n", 1)[0]
                        if div.a:
                            url = div.a['href'].strip()

                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                elif "result-op" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        title = div.text.strip().split("\n", 1)[0]
                        url = div.a['href'].strip()
                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        # abstract = div.text.strip()
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                else:
                    # Template-specific layouts (tpl attribute).
                    if div.get("tpl", "") != "se_com_default":
                        if div.get("tpl", "") == "se_st_com_abstract":
                            if len(div.contents) >= 1:
                                title = div.h3.text.strip()
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                        else:
                            if len(div.contents) >= 2:
                                if div.h3:
                                    title = div.h3.text.strip()
                                    url = div.h3.a['href'].strip()
                                else:
                                    title = div.contents[0].text.strip()
                                    url = div.h3.a['href'].strip()
                                # abstract = div.contents[-1].text
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                    else:
                        # Default organic-result template.
                        if div.h3:
                            title = div.h3.text.strip()
                            url = div.h3.a['href'].strip()
                        else:
                            title = div.contents[0].text.strip()
                            url = div.h3.a['href'].strip()
                        if div.find("div", class_="c-abstract"):
                            abstract = div.find("div", class_="c-abstract").text.strip()
                        elif div.div:
                            abstract = div.div.text.strip()
                        else:
                            abstract = div.text.strip()
            except Exception as e:
                # A malformed entry is skipped; the rest of the page is kept.
                if debug:
                    print("catch exception duration parsing page html, e={}".format(e))
                continue

            # Truncate overly long abstracts.
            if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
                abstract = abstract[:ABSTRACT_MAX_LENGTH]

            rank_start+=1
            list_data.append({"title": title, "abstract": abstract, "url": url, "rank": rank_start})

        # Locate the pager links ("a.n" are prev/next buttons).
        next_btn = root.find_all("a", class_="n")

        # Last page: the only pager link is "previous page" (上一页),
        # so return the data without a next-page URL.
        if len(next_btn) <= 0 or u"上一页" in next_btn[-1].text:
            return list_data, None

        next_url = baidu_host_url + next_btn[-1]["href"]
        return list_data, next_url
    except Exception as e:
        # Network failure or missing container — signal "page unusable".
        if debug:
            print(u"catch exception duration parsing page html, e:{}".format(e))
        return None, None
def run():
    """
    CLI entry point: read the keyword and optional result count / debug
    flag from argv, or prompt for a keyword interactively, then print
    the search results.

    :return: None
    """
    default_keyword = u"长风破浪小武哥"
    num_results = 10
    debug = 0

    prompt = """
    baidusearch: not enough arguments
    [0]keyword: keyword what you want to search
    [1]num_results: number of results
    [2]debug: debug switch, 0-close, 1-open, default-0
    eg: baidusearch NBA
        baidusearch NBA 6
        baidusearch NBA 8 1
    """
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
        # BUGFIX: the old code only parsed the optional args when all four
        # argv entries were present, so 'baidusearch NBA 6' (the usage
        # example above) silently ignored the count. Parse each optional
        # argument independently.
        try:
            if len(sys.argv) > 2:
                num_results = int(sys.argv[2])
            if len(sys.argv) > 3:
                debug = int(sys.argv[3])
        except ValueError:
            # Non-numeric optional arguments fall back to the defaults
            # (narrowed from a bare 'except' that hid every error).
            pass
    else:
        print(prompt)
        keyword = input("please input keyword: ")

    if not keyword:
        keyword = default_keyword

    print("---start search: [{}], expected number of results:[{}].".format(keyword, num_results))
    results = search(keyword, num_results=num_results, debug=debug)

    if isinstance(results, list):
        print("search results:(total[{}]items.)".format(len(results)))
        for res in results:
            print("{}. {}\n {}\n {}".format(res['rank'], res["title"], res["abstract"], res["url"]))
    else:
        print("start search: [{}] failed.".format(keyword))
# Script entry point.
if __name__ == '__main__':
    run()