Skip to content

Commit

Permalink
added digitimes, chinatimes, xfastest
Browse files Browse the repository at this point in the history
  • Loading branch information
lostmypillow committed Aug 9, 2024
1 parent b373dd8 commit 885d3a1
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 11 deletions.
3 changes: 1 addition & 2 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ const port = 3001;
app.use(cors());

app.get("/test", async (req, res) => {
var data = await scrapeContent(decodeURI(req.query.url));
res.json({ data });
res.json(await scrapeContent(decodeURI(req.query.url)));
});

app.listen(port, () => {
Expand Down
2 changes: 1 addition & 1 deletion example.http
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ GET http://localhost:3000/test?url=https%3A%2F%2Fmoney.udn.com%2Fmoney%2Fstory%2


###
GET http://localhost:3000/test?url=https%3A%2F%2Fudn.com%2Fnews%2Fstory%2F7240%2F8145957 HTTP/1.1
GET http://localhost:3001/test?url=https%3A%2F%2Fwww.xfastest.com%2Fthread-290981-1-1.html HTTP/1.1


###
Expand Down
108 changes: 100 additions & 8 deletions lib/scrapeContent.js
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,9 @@ async function scrapeContent(link) {
!text.startsWith("googletag") &&
!text.startsWith("▲") &&
!text.startsWith("訂閱手機王,快速掌握") &&
!text.startsWith("現在,你也可以同步追蹤")
!text.startsWith("現在,你也可以同步追蹤") &&
!text.includes("訂閱追蹤") &&
!text.includes("手機王網友")
);
content = textArray;
break;
Expand Down Expand Up @@ -193,7 +195,10 @@ async function scrapeContent(link) {
const udnresult = udnmatch ? udnmatch[1] : "";

date_source_author =
$("time.article-content__time").text().replace(/\//g, "-").slice(0, 10) +
$("time.article-content__time")
.text()
.replace(/\//g, "-")
.slice(0, 10) +
" / 經濟日報 / " +
$("a[href^='/news/reporter/']").text();
content = [];
Expand All @@ -203,31 +208,118 @@ async function scrapeContent(link) {
}
});
break;
///////

///////

////digitimes
case link.includes("digitimes"):
title = $("h1.news-title").text();
date_source_author =
$("time").text() + " / 電子時報 / " + $("font").first().text();
content = [];
break;
/////

/////chinatimes
case link.includes("chinatimes"):
title = $("h1.article-title").text();
date_source_author =
$("span.date").first().text() +
" / " +
$("div.source").text() +
" / " +
$("div.author").text();

content = [];
$("div.article-body p").each((index, element) => {
if (!$(element).text().trim() == "") {
content.push($(element).text().trim());
}
});
break;
//////

case link.includes("ctimes"):
break;

///////kocpc
case link.includes("kocpc"):
title = $("h1.jeg_post_title").text();
date_source_author =
$('a[href="' + link + '"]')
.text()
.replace(/(\d{4}) 年 (\d{2}) 月 (\d{2}) 日/, "$1-$2-$3") +
" / 電腦王阿達 / " +
$("div.jeg_meta_author a").text();
//Zi_ad_ar_iR
content = [];
$("div.Zi_ad_ar_iR p").each((index, element) => {
if (
!$(element).text().trim() == "" &&
!$(element).text().startsWith("<img")
) {
content.push($(element).text().trim());
}
});
//kocpc not done
break;
//////

case link.includes("3c.ltn"):
title = ""
date_source_author = ""
content = []
break;

case link.includes("ec.ltn"):
break;
case link.includes("ltn"):
break;


//////XFastest
case link.includes("xfastest"):
const formattedDate = $('span[title^="202"]')
.attr("title")
.replace(
/(\d{4})-(\d{1,2})-(\d{1,2}) .*/,
(match, year, month, day) => {
// Pad month and day with leading zeros if necessary
return `${year}-${month.padStart(2, "0")}-${day.padStart(2, "0")}`;
}
);
title = $("span#thread_subject").text();
date_source_author =
formattedDate +
" / XFastest / " +
$("a[href^='space-uid']").last().text();
content = [];
$('td[id^="post"]').each((index, element) => {
// Get innerHTML of the selected <td> element
const innerHTML = $(element).html();

// Split the innerHTML by <br></br> tag
content = innerHTML
.split(/<\/?[^>]+>/)
.map((item) => item.trim()) // Trim all elements
.filter((item) => item !== "")
.filter((item) => !item.includes("&nbsp;"))
.filter((item) => item !== "上傳")
.filter((item) => item !== "消息來源")
.filter((item) => !item.includes("jpg"))
.filter((item) => !item.startsWith("("))
.filter((item) => item !== "下載附件")
.filter((item) => item !== "保存到相冊");
});

break;
//////

case link.includes("cnyes"):
break;

case link.includes("moneydj"):
break;
case link.includes("investor"):
break;

case link.includes("investor"):
break;
}

return { title, date_source_author, link, content };
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"license": "ISC",
"packageManager": "[email protected]+sha512.140036830124618d624a2187b50d04289d5a087f326c9edfc0ccd733d76c4f52c3a313d4fc148794a2a9d81553016004e6742e8cf850670268a7387fc220c903",
"dependencies": {
"cheerio": "1.0.0-rc.12",
"cors": "^2.8.5",
"docx": "^8.5.0",
"express": "^4.19.2",
Expand Down

0 comments on commit 885d3a1

Please sign in to comment.