#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by Charles on 2018/10/10
# Function:
|
import sys
import requests
from bs4 import BeautifulSoup
# Maximum number of characters kept from a result abstract.
ABSTRACT_MAX_LENGTH = 300    # abstract max length

# Candidate User-Agent strings.
# NOTE(review): this pool is defined but never used — HEADERS below always
# sends the fixed Chrome UA. Kept for possible rotation; confirm before removing.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
    ' Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR) AppleWebKit/533.3 '
    '(KHTML, like Gecko) QtWeb Internet Browser/3.7 http://www.QtWeb.net',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/41.0.2228.0 Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, '
    'like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4pre) '
    'Gecko/20070404 K-Ninja/2.1.3',
    'Mozilla/5.0 (Future Star Technologies Corp.; Star-Blade OS; x86_64; U; '
    'en-US) iNet Browser 4.7',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) '
    'Gecko/20080414 Firefox/2.0.0.13 Pogo/2.0.0.13.6866'
]

# Request headers sent with every search request (fixed desktop-Chrome UA,
# Baidu referer so the request looks like an in-site navigation).
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Content-Type": "application/x-www-form-urlencoded",
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    "Referer": "https://www.baidu.com/",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9"
}

# Baidu host (used to absolutize the next-page href) and the search endpoint
# to which the url-encoded keyword is appended.
baidu_host_url = "https://www.baidu.com"
baidu_search_url = "https://www.baidu.com/s?ie=utf-8&tn=baidu&wd="

# Shared session: reuses the headers above and keeps cookies / connections
# across successive result-page fetches.
session = requests.Session()
session.headers = HEADERS
def search(keyword, num_results=10, debug=0):
    """
    Search Baidu for *keyword* and collect organic results.

    :param keyword: query string; a falsy keyword returns None immediately
    :param num_results: upper bound on the number of results returned
    :param debug: 1 to print per-page progress, 0 for silence
    :return: list of dicts with keys "title", "abstract", "url", "rank",
             or None when keyword is empty
    """
    if not keyword:
        return None

    list_result = []
    page = 1

    # First results page for this query.
    next_url = baidu_search_url + keyword

    # Walk result pages until we have enough entries or run out of pages;
    # parse_html returns (entries, next-page url or None).
    while len(list_result) < num_results:
        # BUGFIX: the debug flag was accepted but never forwarded, so
        # parse_html's exception logging could never fire.
        data, next_url = parse_html(next_url, rank_start=len(list_result), debug=debug)
        if data:
            list_result += data
            if debug:
                print("---searching[{}], finish parsing page {}, results number={}: ".format(keyword, page, len(data)))
                for d in data:
                    print(str(d))

        if not next_url:
            if debug:
                print(u"already search the last page。")
            break
        page += 1

    if debug:
        print("\n---search [{}] finished. total results number={}!".format(keyword, len(list_result)))
    # Trim any overshoot from the final page.
    return list_result[: num_results] if len(list_result) > num_results else list_result
def parse_html(url, rank_start=0, debug=0):
    """
    Fetch one Baidu results page and extract its result entries.

    :param url: results-page URL to fetch
    :param rank_start: rank offset of the first result on this page
    :param debug: 1 to print parsing exceptions, 0 for silence
    :return: tuple (list of result dicts, next-page URL or None);
             (None, None) when the page cannot be fetched or parsed
    """
    try:
        res = session.get(url=url)
        res.encoding = "utf-8"
        root = BeautifulSoup(res.text, "lxml")

        list_data = []
        div_contents = root.find("div", id="content_left")
        for div in div_contents.contents:
            # Skip non-Tag children (NavigableStrings) of the results container.
            if type(div) != type(div_contents):
                continue

            class_list = div.get("class", [])
            if not class_list:
                continue

            # Only "c-container" divs are search-result entries.
            if "c-container" not in class_list:
                continue

            # NOTE(review): this deliberately shadows the `url` parameter;
            # the parameter is only needed for session.get() above.
            title = ''
            url = ''
            abstract = ''
            try:
                # For every hit, pull out title, link and abstract text;
                # layout differs per result template, hence the branches.
                if "xpath-log" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        title = div.text.strip().split("\n", 1)[0]
                        if div.a:
                            url = div.a['href'].strip()

                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                elif "result-op" in class_list:
                    if div.h3:
                        title = div.h3.text.strip()
                        url = div.h3.a['href'].strip()
                    else:
                        title = div.text.strip().split("\n", 1)[0]
                        url = div.a['href'].strip()
                    if div.find("div", class_="c-abstract"):
                        abstract = div.find("div", class_="c-abstract").text.strip()
                    elif div.div:
                        abstract = div.div.text.strip()
                    else:
                        # abstract = div.text.strip()
                        abstract = div.text.strip().split("\n", 1)[1].strip()
                else:
                    # Template-specific layouts (tpl attribute).
                    if div.get("tpl", "") != "se_com_default":
                        if div.get("tpl", "") == "se_st_com_abstract":
                            if len(div.contents) >= 1:
                                title = div.h3.text.strip()
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                        else:
                            if len(div.contents) >= 2:
                                if div.h3:
                                    title = div.h3.text.strip()
                                    url = div.h3.a['href'].strip()
                                else:
                                    title = div.contents[0].text.strip()
                                    url = div.h3.a['href'].strip()
                                # abstract = div.contents[-1].text
                                if div.find("div", class_="c-abstract"):
                                    abstract = div.find("div", class_="c-abstract").text.strip()
                                elif div.div:
                                    abstract = div.div.text.strip()
                                else:
                                    abstract = div.text.strip()
                    else:
                        # Default organic-result template.
                        if div.h3:
                            title = div.h3.text.strip()
                            url = div.h3.a['href'].strip()
                        else:
                            title = div.contents[0].text.strip()
                            url = div.h3.a['href'].strip()
                        if div.find("div", class_="c-abstract"):
                            abstract = div.find("div", class_="c-abstract").text.strip()
                        elif div.div:
                            abstract = div.div.text.strip()
                        else:
                            abstract = div.text.strip()
            except Exception as e:
                # A malformed entry is skipped; the rest of the page is kept.
                if debug:
                    print("catch exception duration parsing page html, e={}".format(e))
                continue

            # Truncate overly long abstracts.
            if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
                abstract = abstract[:ABSTRACT_MAX_LENGTH]

            rank_start+=1
            list_data.append({"title": title, "abstract": abstract, "url": url, "rank": rank_start})

        # Locate the pager links ("a.n" are prev/next buttons).
        next_btn = root.find_all("a", class_="n")

        # Last page: the only pager link is "previous page" (上一页),
        # so return the data without a next-page URL.
        if len(next_btn) <= 0 or u"上一页" in next_btn[-1].text:
            return list_data, None

        next_url = baidu_host_url + next_btn[-1]["href"]
        return list_data, next_url
    except Exception as e:
        # Network failure or missing container — signal "page unusable".
        if debug:
            print(u"catch exception duration parsing page html, e:{}".format(e))
        return None, None
def run():
    """
    CLI entry point: read the keyword and optional result count / debug
    flag from argv, or prompt for a keyword interactively, then print
    the search results.

    :return: None
    """
    default_keyword = u"长风破浪小武哥"
    num_results = 10
    debug = 0

    prompt = """
    baidusearch: not enough arguments
    [0]keyword: keyword what you want to search
    [1]num_results: number of results
    [2]debug: debug switch, 0-close, 1-open, default-0
    eg: baidusearch NBA
        baidusearch NBA 6
        baidusearch NBA 8 1
    """
    if len(sys.argv) > 1:
        keyword = sys.argv[1]
        # BUGFIX: the old code only parsed the optional args when all four
        # argv entries were present, so 'baidusearch NBA 6' (the usage
        # example above) silently ignored the count. Parse each optional
        # argument independently.
        try:
            if len(sys.argv) > 2:
                num_results = int(sys.argv[2])
            if len(sys.argv) > 3:
                debug = int(sys.argv[3])
        except ValueError:
            # Non-numeric optional arguments fall back to the defaults
            # (narrowed from a bare 'except' that hid every error).
            pass
    else:
        print(prompt)
        keyword = input("please input keyword: ")

    if not keyword:
        keyword = default_keyword

    print("---start search: [{}], expected number of results:[{}].".format(keyword, num_results))
    results = search(keyword, num_results=num_results, debug=debug)

    if isinstance(results, list):
        print("search results:(total[{}]items.)".format(len(results)))
        for res in results:
            print("{}. {}\n {}\n {}".format(res['rank'], res["title"], res["abstract"], res["url"]))
    else:
        print("start search: [{}] failed.".format(keyword))
# Script entry point.
if __name__ == '__main__':
    run()