加拿大即时新闻_多伦多即时新闻

// Crawler configuration for the real-time news section of info.51.ca.
// NOTE(review): `SourceType` is not defined in this file - presumably a global
// supplied by the crawler platform; confirm.
var configs = {
  domains: ["info.51.ca"],
  scanUrls: ["http://info.51.ca/news/realtime"],
  // Matches URLs shaped like http://info.51.ca/news/realtime/2017-05/12345.html
  contentUrlRegexes: ["http://info\\.51\\.ca/news/realtime/\\d{4}-\\d{1,2}/\\d+\\.html"],
  // Matches URLs shaped like http://info.51.ca/news/realtime/list-2.html
  helperUrlRegexes: ["http://info\\.51\\.ca/news/realtime/list-\\d+\\.html"],
  overseas: true,
  fields: [
    {
      name: "article_title",
      selector: "//*[@id='article-main']/header/h1",
      required: true // must not be empty
    },
    {
      name: "article_content",
      selector: "//*[@id='arcbody']",
      required: true
    },
    // The two fields below select by the ids that onProcessHelperPage puts
    // into the contextData HTML it attaches to each enqueued URL.
    {
      name: "article_disget",
      sourceType: SourceType.UrlContext,
      selector: "//*[@id='article_disget']"
    },
    {
      name: "article_thumbnail",
      sourceType: SourceType.UrlContext,
      selector: "//*[@id='article_thumbnail']"
    }
  ]
};

/**
 * Post-processes a field value after it has been extracted.
 *
 * For the field "article_publish_time" the value is parsed with Date.parse and
 * returned as a string of whole Unix seconds, or "0" when it cannot be parsed.
 * Every other field is returned unchanged.
 *
 * NOTE(review): no field named "article_publish_time" is declared in the
 * `fields` lists visible in this file, so this branch may never fire.
 *
 * @param fieldName Name of the field being processed.
 * @param data      Extracted value of the field.
 * @param page      Unused here.
 * @returns The (possibly converted) field value.
 */
configs.afterExtractField = function (fieldName, data, page) {
  if (fieldName === "article_publish_time") {
    var timestamp = Date.parse(data);
    // Date.parse yields milliseconds; floor so sub-second input does not
    // produce a fractional timestamp such as "1577836800.5".
    return isNaN(timestamp) ? "0" : String(Math.floor(timestamp / 1000));
  }
  return data;
};

/**
 * Handles a list ("helper") page: extracts a digest and a thumbnail from the
 * page and enqueues an article URL that carries them as context data.
 *
 * @param page    Page object; `page.raw` is searched for article links.
 * @param content Page content searched for the digest and thumbnail.
 * @param site    Object whose `addUrl(url, options)` is called to enqueue a URL.
 * @returns `false` once a link has been enqueued; `undefined` when no link
 *          was found.
 */
configs.onProcessHelperPage = function (page, content, site) {
  var article_disget = extractList(content, "/html/body/div[4]/section/div[2]/div[1]/div[2]/p/a");
  var article_thumbnail = extractList(content, "/html/body/div[4]/section/div[2]/div[1]/div[1]/a/img");

  // Wrap both values in elements whose ids match the selectors of the
  // UrlContext fields declared in `configs.fields`.
  var ext_html_str = '<p id="article_disget">' + article_disget + '</p>' +
    '<p id="article_thumbnail">' + article_thumbnail + '</p>';

  var options = {
    method: "get",
    contextData: ext_html_str
  };

  var links = extractList(page.raw, "//div[contains(@class,'textbox')]/h3/a/@href");

  // Only the first link is enqueued. The original code looped over every link
  // but had `return false` inside the loop body, so it exited after the first
  // iteration; that behaviour is kept and made explicit here.
  // NOTE(review): the digest/thumbnail XPaths above are positional, so they
  // presumably describe a single list entry. Enqueuing every link would need
  // per-entry context data - confirm the intent before changing this.
  if (links.length > 0) {
    site.addUrl("http://info.51.ca" + links[0], options);
    return false;
  }
};

// Build a crawler from `configs` and start it.
// NOTE(review): `Crawler` is not defined in this file - presumably supplied by
// the hosting crawler platform; confirm.
var crawler = new Crawler(configs);
crawler.start();

蒙城华人网:
// Crawler configuration for the www.sinoquebec.com portal list (catid=1).
// NOTE(review): `SourceType` is not defined in this file - presumably a global
// supplied by the crawler platform; confirm.
var configs = {
  domains: ["sinoquebec.com"],
  scanUrls: ["http://www.sinoquebec.com/portal.php?mod=list&catid=1"],
  // Matches URLs shaped like http://www.sinoquebec.com/portal.php?mod=view&aid=123
  contentUrlRegexes: ["http://www\\.sinoquebec\\.com/portal\\.php\\?mod=view&aid=\\d+"],
  // Matches URLs shaped like http://www.sinoquebec.com/portal.php?mod=list&catid=1&page=2
  helperUrlRegexes: ["http://www\\.sinoquebec\\.com/portal\\.php\\?mod=list&catid=1&page=\\d+"],
  overseas: true,
  fields: [
    {
      name: "article_title",
      selector: "//*[@class='ph']",
      required: true // must not be empty
    },
    {
      name: "article_content",
      selector: "//*[@id='article_content']",
      required: true
    },
    // The two fields below select by the ids that onProcessHelperPage puts
    // into the contextData HTML it attaches to each enqueued URL.
    {
      name: "article_disget",
      sourceType: SourceType.UrlContext,
      selector: "//*[@id='article_disget']"
    },
    {
      name: "article_thumbnail",
      sourceType: SourceType.UrlContext,
      selector: "//*[@id='article_thumbnail']"
    }
  ]
};

/**
 * Post-processes a field value after it has been extracted.
 *
 * For the field "article_publish_time" the value is parsed with Date.parse and
 * returned as a string of whole Unix seconds, or "0" when it cannot be parsed.
 * Every other field is returned unchanged.
 *
 * NOTE(review): no field named "article_publish_time" is declared in the
 * `fields` lists visible in this file, so this branch may never fire.
 *
 * @param fieldName Name of the field being processed.
 * @param data      Extracted value of the field.
 * @param page      Unused here.
 * @returns The (possibly converted) field value.
 */
configs.afterExtractField = function (fieldName, data, page) {
  if (fieldName === "article_publish_time") {
    var timestamp = Date.parse(data);
    // Date.parse yields milliseconds; floor so sub-second input does not
    // produce a fractional timestamp such as "1577836800.5".
    return isNaN(timestamp) ? "0" : String(Math.floor(timestamp / 1000));
  }
  return data;
};

/**
 * Handles a list ("helper") page: extracts a digest and a thumbnail from the
 * page and enqueues an article URL that carries them as context data.
 *
 * @param page    Page object; `page.raw` is searched for article links.
 * @param content Page content searched for the digest and thumbnail.
 * @param site    Object whose `addUrl(url, options)` is called to enqueue a URL.
 * @returns `false` once a link has been enqueued; `undefined` when no link
 *          was found.
 */
configs.onProcessHelperPage = function (page, content, site) {
  var article_disget = extractList(content, "//*[@id='ct']/div[1]/div[3]/div[3]/dl[1]/div[1]/dd[1]/text()");
  var article_thumbnail = extractList(content, "//*[@id='ct']/div[1]/div[3]/div[3]/dl[1]/div[2]/a/img");

  // Wrap both values in elements whose ids match the selectors of the
  // UrlContext fields declared in `configs.fields`.
  var ext_html_str = '<p id="article_disget">' + article_disget + '</p>' +
    '<p id="article_thumbnail">' + article_thumbnail + '</p>';

  var options = {
    method: "get",
    contextData: ext_html_str
  };

  var links = extractList(page.raw, "//*[@class='zn-pic']/a/@href");

  // Only the first link is enqueued. The original code looped over every link
  // but had `return false` inside the loop body, so it exited after the first
  // iteration; that behaviour is kept and made explicit here.
  // NOTE(review): the digest/thumbnail XPaths above address `dl[1]`, i.e. the
  // first list entry only. Enqueuing every link would need per-entry context
  // data - confirm the intent before changing this.
  if (links.length > 0) {
    site.addUrl("http://www.sinoquebec.com/" + links[0], options);
    return false;
  }
};
// Build a crawler from `configs` and start it.
// NOTE(review): `Crawler` is not defined in this file - presumably supplied by
// the hosting crawler platform; confirm.
var crawler = new Crawler(configs);
crawler.start();
在蒙城:
// Crawler configuration for forum 83 on zaimengcheng.com.
// NOTE(review): `SourceType` is not defined in this file - presumably a global
// supplied by the crawler platform; confirm.
var configs = {
  domains: ["zaimengcheng.com"],
  scanUrls: ["http://zaimengcheng.com/forum-83-1.html"],
  // Matches URLs shaped like
  // http://zaimengcheng.com/forum.php?mod=viewthread&tid=123&extra=page%3D1&from=portal
  contentUrlRegexes: ["http://zaimengcheng\\.com/forum\\.php\\?mod=viewthread&tid=\\d+\\&extra=page%3D1&from=portal"],
  // Matches URLs shaped like
  // http://zaimengcheng.com/forum.php?mod=forumdisplay&fid=83&page=2
  helperUrlRegexes: ["http://zaimengcheng\\.com/forum\\.php\\?mod=forumdisplay&fid=83&page=\\d+"],
  overseas: true,
  fields: [
    {
      name: "article_title",
      // NOTE(review): 'xh-highlight' looks like a class injected by an XPath
      // debugging tool rather than one present on the real page - verify.
      selector: "//*[@class='xh-highlight']",
      required: true // must not be empty
    },
    {
      name: "article_content",
      selector: "//*[@class='pcb']",
      required: true
    },
    // Selects by the id that onProcessHelperPage puts into the contextData
    // HTML it attaches to each enqueued URL.
    {
      name: "article_disget",
      sourceType: SourceType.UrlContext,
      selector: "//*[@id='article_disget']"
    }
  ]
};

/**
 * Post-processes a field value after it has been extracted.
 *
 * For the field "article_publish_time" the value is parsed with Date.parse and
 * returned as a string of whole Unix seconds, or "0" when it cannot be parsed.
 * Every other field is returned unchanged.
 *
 * NOTE(review): no field named "article_publish_time" is declared in the
 * `fields` lists visible in this file, so this branch may never fire.
 *
 * @param fieldName Name of the field being processed.
 * @param data      Extracted value of the field.
 * @param page      Unused here.
 * @returns The (possibly converted) field value.
 */
configs.afterExtractField = function (fieldName, data, page) {
  if (fieldName === "article_publish_time") {
    var timestamp = Date.parse(data);
    // Date.parse yields milliseconds; floor so sub-second input does not
    // produce a fractional timestamp such as "1577836800.5".
    return isNaN(timestamp) ? "0" : String(Math.floor(timestamp / 1000));
  }
  return data;
};

/**
 * Handles a list ("helper") page: extracts a digest from the page and
 * enqueues a thread URL that carries it as context data.
 *
 * @param page    Page object; `page.raw` is searched for thread links.
 * @param content Page content searched for the digest.
 * @param site    Object whose `addUrl(url, options)` is called to enqueue a URL.
 * @returns `false` once a link has been enqueued; `undefined` when no link
 *          was found.
 */
configs.onProcessHelperPage = function (page, content, site) {
  var article_disget = extractList(content, "//*[@id='ct']/div[1]/div[3]/div[3]/dl[1]/div[1]/dd[1]/text()");

  // Wrap the digest in an element whose id matches the selector of the
  // UrlContext field declared in `configs.fields`.
  var ext_html_str = '<p id="article_disget">' + article_disget + '</p>';

  var options = {
    method: "get",
    contextData: ext_html_str
  };

  // NOTE(review): this selector is pinned to one thread id (209225), so it can
  // only ever match that thread's row - confirm whether a prefix match on
  // 'normalthread_' was intended.
  var links = extractList(page.raw, "//*[@id='normalthread_209225']/tr/th/a[2]/@href");

  // Only the first link is enqueued. The original code looped over every link
  // but had `return false` inside the loop body, so it exited after the first
  // iteration; that behaviour is kept and made explicit here.
  if (links.length > 0) {
    // The original prefixed links with http://www.sinoquebec.com/, which is
    // outside this script's `domains` (zaimengcheng.com).
    site.addUrl("http://zaimengcheng.com/" + links[0], options);
    return false;
  }
};
// Build a crawler from `configs` and start it.
// NOTE(review): `Crawler` is not defined in this file - presumably supplied by
// the hosting crawler platform; confirm.
var crawler = new Crawler(configs);
crawler.start();

约克论坛:

// Crawler configuration for the local-news section of news.yorkbbs.ca.
// NOTE(review): `SourceType` is not defined in this file - presumably a global
// supplied by the crawler platform; confirm.
var configs = {
  domains: ["news.yorkbbs.ca"],
  scanUrls: ["http://news.yorkbbs.ca/local/"],
  // Matches URLs shaped like http://news.yorkbbs.ca/local/2017-05/12345.html
  contentUrlRegexes: ["http://news\\.yorkbbs\\.ca/local/\\d{4}-\\d{1,2}/\\d+\\.html"],
  // Matches URLs shaped like http://news.yorkbbs.ca/local/index_2.html
  helperUrlRegexes: ["http://news\\.yorkbbs\\.ca/local/index_\\d+\\.html"],
  overseas: true,
  fields: [
    {
      name: "article_title",
      selector: "//*[@class='article-content']//h1",
      required: true // must not be empty
    },
    {
      name: "article_content",
      selector: "//*[@class='article-main']",
      required: true
    },
    // The two fields below select by the ids that onProcessHelperPage puts
    // into the contextData HTML it attaches to each enqueued URL.
    {
      name: "article_disget",
      sourceType: SourceType.UrlContext,
      selector: "//*[@id='article_disget']"
    },
    {
      name: "article_thumbnail",
      sourceType: SourceType.UrlContext,
      selector: "//*[@id='article_thumbnail']"
    }
  ]
};

/**
 * Post-processes a field value after it has been extracted.
 *
 * For the field "article_publish_time" the value is parsed with Date.parse and
 * returned as a string of whole Unix seconds, or "0" when it cannot be parsed.
 * Every other field is returned unchanged.
 *
 * NOTE(review): no field named "article_publish_time" is declared in the
 * `fields` lists visible in this file, so this branch may never fire.
 *
 * @param fieldName Name of the field being processed.
 * @param data      Extracted value of the field.
 * @param page      Unused here.
 * @returns The (possibly converted) field value.
 */
configs.afterExtractField = function (fieldName, data, page) {
  if (fieldName === "article_publish_time") {
    var timestamp = Date.parse(data);
    // Date.parse yields milliseconds; floor so sub-second input does not
    // produce a fractional timestamp such as "1577836800.5".
    return isNaN(timestamp) ? "0" : String(Math.floor(timestamp / 1000));
  }
  return data;
};
/**
 * Handles a list ("helper") page: extracts a digest and a thumbnail from the
 * page and enqueues an article URL that carries them as context data.
 *
 * @param page    Page object; `page.raw` is searched for article links.
 * @param content Page content searched for the digest and thumbnail.
 * @param site    Object whose `addUrl(url, options)` is called to enqueue a URL.
 * @returns `false` once a link has been enqueued; `undefined` when no link
 *          was found.
 */
configs.onProcessHelperPage = function (page, content, site) {
  var article_disget = extractList(content, "//*[@class='newslist']/dl[1]/dd/div/p/text()");
  var article_thumbnail = extractList(content, "/html/body/div[6]/div/section/dl[1]/dd/a/div/img");

  // Wrap both values in elements whose ids match the selectors of the
  // UrlContext fields declared in `configs.fields`.
  var ext_html_str = '<p id="article_disget">' + article_disget + '</p>' +
    '<p id="article_thumbnail">' + article_thumbnail + '</p>';

  var options = {
    method: "get",
    contextData: ext_html_str
  };

  var links = extractList(page.raw, "//*[@class='newslist']/dl/dt/a/@href");

  // Only the first link is enqueued. The original code looped over every link
  // but had `return false` inside the loop body, so it exited after the first
  // iteration; that behaviour is kept and made explicit here.
  // NOTE(review): the digest/thumbnail XPaths above address `dl[1]`, i.e. the
  // first list entry only. Enqueuing every link would need per-entry context
  // data - confirm the intent before changing this.
  if (links.length > 0) {
    // Links are passed through unchanged (no host prefix is added).
    site.addUrl(links[0], options);
    return false;
  }
};

// Build a crawler from `configs` and start it.
// NOTE(review): `Crawler` is not defined in this file - presumably supplied by
// the hosting crawler platform; confirm.
var crawler = new Crawler(configs);
crawler.start();

Tagged:

发表评论

电子邮件地址不会被公开。 必填项已用*标注