「HTML&JS&CSS」我用JS写了一个可以爬取京东商品信息和评价的脚本,分享下源码

JavaScript xiaobihu 发表于 3 月前

话不多说,直接上源码:

var keyword = "d3.js";//@input (keyword, 查询关键字, 爬取该关键字搜索出来的京东商品)
var comment_count = 100;//@input (comment_count, 爬取的评论数, 最多爬取多少条评论)

var page_count = comment_count / 10;
keyword = keyword.trim();
var scanUrls = [];
scanUrls.push("http://search.jd.com/Search?keyword="+keyword.replace(/ /g, "+")+"&enc=utf-8&qrst=1&rt=1&stop=1&book=y&vt=2&page=1&s=1&click=0");
var helperUrlRegexes = [];
helperUrlRegexes.push("http://search.jd.com/Search\?keyword="+keyword.replace(/ /g, "+").replace(/./g, ".")+"&enc=utf-8&qrst=1&rt=1&stop=1&book=y&vt=2&page=\d+&s=1&click=0");

var configs = {
domains: ["search.jd.com","item.jd.com","club.jd.com"],
scanUrls: scanUrls,
contentUrlRegexes: ["http://item.jd.com/\d+.html"],
helperUrlRegexes: helperUrlRegexes,
fields: [
{
// 第一个抽取项
name: "title",
selector: "//div[@id ='name']/h1",
required: true
},
{
// 第一个抽取项
name: "productid",
selector: "//div[contains(@class ,'fl')]/span[2]",
required: true
},
{
name: "comments",
selector: "//div[@id ='comment-pages']/span",
repeated: true,
children: [
{
name: "page",
selector: "//text()"
},
{
name: "comments",
sourceType: SourceType.AttachedUrl,
attachedUrl: "http://club.jd.com/productpage/p-{$.productid}-s-0-t-3-p-{page}.html", selectorType: SelectorType.JsonPath, selector: "$.comments",
repeated: true,
children:[
{
name: "com_content",
selectorType: SelectorType.JsonPath,
selector: "$.content" }, { name: "com_nickname", selectorType: SelectorType.JsonPath, selector: "$.nickname"
}
]
}
]
}
]
};
configs.afterDownloadPage = function(page, site) {
var matches = /item.jd.com\/(\d+).html/.exec(page.url);
if (!matches) return page;
var commentUrl = "http://club.jd.com/productpage/p-"+matches[1]+"-s-0-t-3-p-0.html";
var result = site.requestUrl(commentUrl);
var data = JSON.parse(result);
var commentCount = data.productCommentSummary.commentCount;
var pages = commentCount / 10;
if (pages > page_count) pages = page_count;
var pageHtml = "<div id=\"comment-pages\">";
for (var i = 0; i < pages; i) {
pageHtml += "<span>" + i + "</span>";
}
pageHtml += "</div>";
var index = page.raw.indexOf("</body>");
page.raw = page.raw.substring(0, index) + pageHtml + page.raw.substring(index);
return page;
};
configs.onProcessHelperUrl = function(url, content, site){
if(!content.indexOf("抱歉,没有找到")){
var currentPage = parseInt(url.substring(url.indexOf("&page=") + 6));
if(currentPage === 0){
currentPage = 1;
}
var page = currentPage + 2;
var nextUrl = url.replace("&page=" + currentPage, "&page=" + page);
site.addUrl(nextUrl);
}
return true;
};
configs.afterExtractPage = function(page, data) {
if (data.comments === null || data.comments === undefined) return data;
var comments = [];
for (var i = 0; i < data.comments.length; i
) {
var p = data.comments;
for (var j = 0; j < p.comments.length; j++) {
comments.push(p.comments[j]);
}
}
data.comments = comments;
return data;
};
var crawler = new Crawler(configs);
crawler.start();

代码运行方法:https://github.com/ShenJianShou/crawler_samples/blob/master/%E5%A6%82%E4%BD%95%E6%89%A7%E8%A1%8C%E6%A0%B7%E4%BE%8B%E4%BB%A3%E7%A0%81.txt

暂无回复,说出你的观点吧
登录后即可参与回复