/*
Crawl: visit every URL returned by loop() and collect the <img> and <video>
elements found at one fixed position in each page.

For each URL the page is opened with wb.go(); unless that call returns 0,
wb.html is parsed with string.html() and the element path
div[1] > main[1] > div[3] > div[1] > div[1] > div[1] is followed. Every
<img> / <video> under that container is appended to the result as a
dictionary with the keys "类型", "标题" and "链接".

Pages where any step of the path is missing are skipped instead of
raising an error on a null element.

Returns: the array of dictionaries collected across all pages.

NOTE(review): loop() is called again on every iteration; if it is
side-effect free it could be fetched once before the loop - confirm
against its definition.
*/
function Crawl(){
    var dataList = {};

    // Return the index-th element matching `tag` under `parent`, or null
    // when the parent is missing or has no such match.
    var queryAt = function(parent, tag, index){
        if(!parent){
            return null;
        }
        var matches = parent.queryEles( tagName = tag );
        if(!matches){
            return null;
        }
        return matches[index];
    };

    for(i=1;#loop();1){
        var url = loop()[i];
        var response = wb.go(url);
        if(response !== 0){
            // Declared with var so the page source does not leak out of
            // this function.
            var html = wb.html;
            var htmlDoc = string.html( html );

            // Walk the fixed container path one level at a time; queryAt
            // propagates null as soon as one level is absent.
            var container = queryAt(htmlDoc, "div", 1);
            container = queryAt(container, "main", 1);
            container = queryAt(container, "div", 3);
            container = queryAt(container, "div", 1);
            container = queryAt(container, "div", 1);
            container = queryAt(container, "div", 1);

            if(container){
                var img_label = container.queryEles( tagName = "img" );
                var video_label = container.queryEles( tagName = "video" );

                // Merge the image and video information into dictionaries
                // and store them in dataList.
                if(img_label){
                    for(k, value in img_label){
                        // Dictionary holding type (image), title and link.
                        var imgDict = {
                            "类型": "图片",
                            "标题": value.title,
                            "链接": value.src
                        };
                        table.push(dataList, imgDict);
                    }
                }

                if(video_label){
                    for(k, v in video_label){
                        // Dictionary holding type (video), title and link.
                        var videoDict = {
                            "类型": "视频",
                            "标题": v.title,
                            "链接": v.src
                        };
                        // Append to the final list.
                        table.push(dataList, videoDict);
                    }
                }
            }
        }
    }

    return dataList;
}
模块:数据保存
功能:接收爬取到的结构化数据,按CSV表头顺序写入数据,最终保存为本地CSV文件,并提示保存结果。
// Crawl the pages, then append one row per collected record to `csv`,
// taking the cells in the order given by `headers`, and save the result
// to products.csv.
var dataList = Crawl();
for(i=1;#dataList;1){
    var record = dataList[i];
    var row = {};
    for(k, columnName in headers){
        var cell = record[columnName];
        // A record may lack a field (for example an element without a
        // title). Write an empty cell in that case so the remaining
        // values stay under their own headers.
        if(cell === null){
            cell = "";
        }
        table.push(row, cell);
    }
    csv.push(row);
}
// NOTE(review): the result of csv.save() is stored but not inspected in
// this section - confirm it is reported to the user further down.
var save = csv.save('products.csv');
function py_method(){ var pyCode = /** import csv from openpyxl import Workbook from openpyxl.styles import Alignment from openpyxl.utils import get_column_letter