-- Main entry point for page scraping (Splash Lua script).
-- NOTE(review): the `{{$...}}` markers look like template placeholders that
-- are substituted before this script is handed to Splash -- confirm against
-- the code that renders this file.

-- The parser must be loaded as a module here; otherwise it seems the
-- JavaScript files cannot be loaded dynamically.
local parser = require('parser')

--- Render a Lua value as a single-quoted JavaScript string literal.
-- Backslashes, single quotes and line breaks are escaped so that a CSS
-- selector such as a[title='next'] cannot terminate the literal early.
-- @param value value to embed (converted with tostring)
-- @return string JavaScript source for the quoted literal
local function js_string_literal(value)
  -- Parentheses drop gsub's second return value (the substitution count).
  local escaped = (string.gsub(tostring(value), "[\\'\n\r]", {
    ["\\"] = "\\\\",
    ["'"] = "\\'",
    ["\n"] = "\\n",
    ["\r"] = "\\r",
  }))
  return "'" .. escaped .. "'"
end

--- Splash entry point: load the start URL, run the parser on the first page,
-- then click through the remaining pages and run the parser on each of them.
-- @param splash Splash object supplied by the Splash runtime
-- @param args   script arguments supplied by Splash (not used here)
-- @return table array holding one parser.select result per visited page
function main(splash, args)
  local pages = {{$pages}}
  local scripts_js = '{{$scripts_js}}'
  local page_element = '{{$page_element}}'
  local wait_for = '{{$wait_for}}'
  local announcement_type = '{{$announcement_type}}'

  splash:go('{{$url}}')
  -- NOTE(review): the values returned by wait_for_element are ignored, so a
  -- timeout does not abort the script; parsing continues with whatever the
  -- page currently contains. Kept as-is -- confirm this is intentional.
  wait_for_element(splash, wait_for)
  wait_for_element(splash, page_element)

  -- Parameters handed to the JavaScript scripts.
  local params_js = { announcement_type = announcement_type }

  -- Add the result of the first page to the returned result set.
  local results = {}
  table.insert(results, parser.select(splash, scripts_js, params_js))

  -- Paging: find the paging element on the page and send it a click() event.
  -- The snippet does not change between iterations, so it is built once.
  local click_js = string.format(
    "document.querySelector(%s).click();",
    js_string_literal(page_element)
  )

  -- When pages <= 1 the loop body never runs and only the first page is
  -- returned, so no separate single-page branch is needed.
  for _ = 2, pages do
    splash:runjs(click_js)

    -- Wait for the page to finish loading.
    wait_for_element(splash, wait_for)
    wait_for_element(splash, page_element)
    -- An extra delay appears to be required here; without it the page may
    -- not have finished updating yet.
    assert(splash:wait(5))

    table.insert(results, parser.select(splash, scripts_js, params_js))
  end

  return results
end

--- Wait until a CSS selector matches an element in the page.
-- The check runs inside the page every 200 ms; once more than `maxwait`
-- seconds have passed without a match, the page-side script reports an error.
-- @param splash  Splash object
-- @param css     CSS selector to wait for
-- @param maxwait maximum time to wait in seconds (defaults to 10)
-- @return whatever splash:wait_for_resume returns for the generated snippet
function wait_for_element(splash, css, maxwait)
  if maxwait == nil then
    maxwait = 10
  end

  return splash:wait_for_resume(string.format([[
    function main(splash) {
      var selector = %s;
      var maxwait = %s;
      var end = Date.now() + maxwait*1000;

      function check() {
        if(document.querySelector(selector)) {
          splash.resume('Element found');
        } else if(Date.now() >= end) {
          var err = 'Timeout waiting for element';
          splash.error(err + " " + selector);
        } else {
          setTimeout(check, 200);
        }
      }
      check();
    }
  ]], js_string_literal(css), maxwait))
end