#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by Charles on 2018/10/10
# Function: search Baidu for a keyword and scrape the title, abstract and URL of each result.

import sys
import requests
from bs4 import BeautifulSoup


# Abstracts longer than this many characters are truncated (a falsy value disables truncation).
ABSTRACT_MAX_LENGTH = 300

# Base URL of the site and the search endpoint; the keyword is appended after "wd=".
baidu_host_url = "https://www.baidu.com"
baidu_search_url = baidu_host_url + "/s?ie=utf-8&tn=baidu&wd="

# User-Agent sent by default; it is also the first entry of ``user_agents``.
_DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
)

# Pool of User-Agent strings. Each parenthesised group is ONE entry assembled
# by implicit string concatenation, so the list holds ten strings.
user_agents = [
    _DEFAULT_USER_AGENT,
    "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)"
        " Ubuntu Chromium/49.0.2623.108 Chrome/49.0.2623.108 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; pt-BR) AppleWebKit/533.3 "
        "(KHTML, like Gecko)  QtWeb Internet Browser/3.7 http://www.QtWeb.net"
    ),
    (
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/41.0.2228.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, "
        "like Gecko) ChromePlus/4.0.222.3 Chrome/4.0.222.3 Safari/532.2"
    ),
    (
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.4pre) "
        "Gecko/20070404 K-Ninja/2.1.3"
    ),
    (
        "Mozilla/5.0 (Future Star Technologies Corp.; Star-Blade OS; x86_64; U; "
        "en-US) iNet Browser 4.7"
    ),
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201",
    (
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) "
        "Gecko/20080414 Firefox/2.0.0.13 Pogo/2.0.0.13.6866"
    ),
]

# Request headers sent with every search request.
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Content-Type": "application/x-www-form-urlencoded",
    "User-Agent": _DEFAULT_USER_AGENT,
    "Referer": baidu_host_url + "/",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
}

# Shared HTTP session used for every request made by this module.
session = requests.Session()
# Install a case-insensitive *copy* of HEADERS. Assigning the HEADERS dict
# itself would make the session and the module constant share one mutable
# object and would replace the session's case-insensitive header mapping
# with a plain dict. The set and order of headers sent is unchanged.
session.headers = requests.structures.CaseInsensitiveDict(HEADERS)


def search(keyword, num_results=10, debug=0):
    """
    Search Baidu for a keyword, following "next page" links until enough
    results have been collected or the last page is reached.

    :param keyword: search phrase; it is percent-encoded before being
        appended to the query URL, so characters such as ``&``, ``#``, ``+``
        or spaces are searched for literally instead of altering the query
    :param num_results: maximum number of results to return
    :param debug: when truthy, print progress after each page and forward the
        flag to ``parse_html`` so that parsing errors are reported as well
    :return: list of result dicts as produced by ``parse_html`` (at most
        ``num_results`` items), or None when ``keyword`` is empty
    """
    if not keyword:
        return None

    # Imported locally so the function does not depend on a module-level import.
    from urllib.parse import quote

    list_result = []
    page = 1

    # URL of the first result page; safe="" also encodes "/" inside the keyword.
    next_url = baidu_search_url + quote(keyword, safe="")

    # Walk the result pages; each call returns that page's results and the
    # URL of the following page (None once there is no further page).
    while len(list_result) < num_results:
        data, next_url = parse_html(next_url, rank_start=len(list_result), debug=debug)
        if data:
            list_result.extend(data)
            if debug:
                print("---searching[{}], finish parsing page {}, results number={}: ".format(keyword, page, len(data)))
                for d in data:
                    print(str(d))

        if not next_url:
            if debug:
                print(u"already search the last page。")
            break
        page += 1

    if debug:
        print("\n---search [{}] finished. total results number={}！".format(keyword, len(list_result)))
    # The last page may have pushed the total past the requested amount.
    return list_result[:num_results]


def _extract_title_and_url(div):
    """
    Return ``(title, url)`` for one result container.

    The ``<h3>`` heading and the link inside it are preferred. Without a
    heading, the title is the first line of the container's text and the URL
    comes from the first anchor in the container ('' when there is none).
    """
    heading = div.h3
    if heading is not None:
        # A heading without a link (or a link without href) raises here; the
        # caller treats that as an unparseable result and skips it.
        return heading.text.strip(), heading.a["href"].strip()

    title = div.text.strip().split("\n", 1)[0]
    anchor = div.a
    link = anchor["href"].strip() if anchor is not None else ""
    return title, link


def _extract_abstract(div, after_title_line):
    """
    Return the abstract text of one result container.

    Preference order: the ``div.c-abstract`` element, then the first nested
    ``<div>``, then the container's own text. For the last case,
    ``after_title_line`` selects only the text after the first line ('' when
    the text has a single line) instead of the whole text.
    """
    block = div.find("div", class_="c-abstract")
    if block is None:
        block = div.div
    if block is not None:
        return block.text.strip()

    text = div.text.strip()
    if after_title_line:
        return text.partition("\n")[2].strip()
    return text


def _parse_result(div, class_list):
    """
    Extract ``(title, url, abstract)`` from one ``c-container`` element, or
    return None when the element does not have the structure expected for
    its template.
    """
    if "xpath-log" in class_list or "result-op" in class_list:
        title, link = _extract_title_and_url(div)
        return title, link, _extract_abstract(div, after_title_line=True)

    tpl = div.get("tpl", "")
    if tpl == "se_st_com_abstract":
        # Only the heading is read for this template; no URL is extracted.
        if div.h3 is None:
            return None
        return div.h3.text.strip(), "", _extract_abstract(div, after_title_line=False)

    # Templates other than "se_com_default" need at least two child nodes.
    if tpl != "se_com_default" and len(div.contents) < 2:
        return None
    title, link = _extract_title_and_url(div)
    return title, link, _extract_abstract(div, after_title_line=False)


def parse_html(url, rank_start=0, debug=0, timeout=10):
    """
    Download one result page and extract its search results.

    :param url: URL of the result page to fetch
    :param rank_start: rank of the last result already collected; results on
        this page are numbered from ``rank_start + 1``
    :param debug: when truthy, print a message for every error that is caught
    :param timeout: seconds passed to ``session.get`` as ``timeout`` so a
        stalled connection cannot block forever
    :return: ``(results, next_url)``. ``results`` is a list of dicts with the
        keys ``title``, ``abstract``, ``url`` and ``rank``; ``next_url`` is
        None on the last page. ``(None, None)`` is returned when the page
        cannot be fetched or parsed, or has no ``div#content_left``.
    """
    try:
        res = session.get(url=url, timeout=timeout)
        res.encoding = "utf-8"
        root = BeautifulSoup(res.text, "lxml")

        container = root.find("div", id="content_left")
        if container is None:
            if debug:
                print("no result container (div#content_left) found in response")
            return None, None

        list_data = []
        rank = rank_start
        for div in container.contents:
            # Children also include text nodes; only elements can be results.
            if not isinstance(div, type(container)):
                continue

            class_list = div.get("class", [])
            if "c-container" not in class_list:
                continue

            try:
                parsed = _parse_result(div, class_list)
            except Exception as e:
                # Best effort: the markup is not under our control, so one
                # malformed result must not discard the rest of the page.
                if debug:
                    print("catch exception duration parsing page html, e={}".format(e))
                continue

            if parsed is None:
                continue
            title, link, abstract = parsed
            # An entry with neither a title nor a URL carries no information.
            if not (title or link):
                continue

            if ABSTRACT_MAX_LENGTH and len(abstract) > ABSTRACT_MAX_LENGTH:
                abstract = abstract[:ABSTRACT_MAX_LENGTH]

            rank += 1
            list_data.append({"title": title, "abstract": abstract, "url": link, "rank": rank})

        # Pager links carry the class "n"; the last one is "next page" unless
        # this is the final page, where it is the "上一页" (previous page) link.
        next_btn = root.find_all("a", class_="n")
        if not next_btn or u"上一页" in next_btn[-1].text:
            return list_data, None

        return list_data, baidu_host_url + next_btn[-1]["href"]
    except Exception as e:
        if debug:
            print(u"catch exception duration parsing page html, e：{}".format(e))
        return None, None


def _parse_int_arg(raw, default, name):
    """Convert a command-line value to int; report and return ``default`` if it is not a number."""
    try:
        return int(raw)
    except ValueError:
        print("invalid {} [{}], using default [{}].".format(name, raw, default))
        return default


def run():
    """
    Command-line entry point.

    Arguments are read from ``sys.argv``: keyword, then optionally the number
    of results, then optionally the debug switch. Every optional argument that
    is present is honoured, so ``baidusearch NBA 6`` limits the output to six
    results. Without arguments the usage text is printed and the keyword is
    read from standard input; an empty keyword falls back to a default one.

    :return: None
    """
    default_keyword = u"长风破浪小武哥"
    num_results = 10
    debug = 0

    prompt = """
    baidusearch: not enough arguments
    [0]keyword: keyword what you want to search
    [1]num_results: number of results
    [2]debug: debug switch, 0-close, 1-open, default-0
    eg: baidusearch NBA
        baidusearch NBA 6
        baidusearch NBA 8 1
    """
    args = sys.argv[1:]
    if args:
        keyword = args[0]
        # Each optional argument is parsed on its own, so a bad value for one
        # does not discard the other.
        if len(args) > 1:
            num_results = _parse_int_arg(args[1], num_results, "num_results")
        if len(args) > 2:
            debug = _parse_int_arg(args[2], debug, "debug")
    else:
        print(prompt)
        keyword = input("please input keyword: ")

    if not keyword:
        keyword = default_keyword

    print("---start search: [{}], expected number of results:[{}].".format(keyword, num_results))
    results = search(keyword, num_results=num_results, debug=debug)

    if isinstance(results, list):
        print("search results：(total[{}]items.)".format(len(results)))
        for res in results:
            print("{}. {}\n   {}\n   {}".format(res['rank'], res["title"], res["abstract"], res["url"]))
    else:
        print("start search: [{}] failed.".format(keyword))

# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    run()