diff --git a/README.md b/README.md
index 03269ba..8b470c5 100644
--- a/README.md
+++ b/README.md
@@ -7,24 +7,35 @@ The [spider](https://github.com/spider-rs/spider) project ported to nodejs via n
 1. `npm i @spider-rs/spider-rs --save`
 
 ```ts
-import { crawl } from '@spider-rs/spider-rs'
+import { Website } from '@spider-rs/spider-rs'
 
-// gather all the links found in a website fast concurrently.
-const { links, pages } = await crawl("https://rsseau.fr");
+const website = new Website("https://rsseau.fr");
+await website.crawl();
+console.log(website.getLinks())
 ```
 
-## Development
+Collect the page resources.
 
-Install the napi cli `npm i @napi-rs/cli --global`.
+```ts
+import { Website } from '@spider-rs/spider-rs'
 
-1. `yarn build:test`
+const website = new Website("https://rsseau.fr");
+await website.scrape();
+console.log(website.getPages())
+```
 
-### TODO: Full Spider Port
+Use the `crawl` shortcut to get the page content and URL.
 
-Port the modules to be used via nodejs to adhere to the spider interface.
+```ts
+import { crawl } from '@spider-rs/spider-rs'
 
-A full port would require FromNapi support on the following modules.
+const { links, pages } = await crawl("https://rsseau.fr");
+console.log(pages)
+```
+
+
+## Development
+
+Install the napi cli `npm i @napi-rs/cli --global`.
 
-- compact_str
-- case_insensitive_str
-- small_vec
\ No newline at end of file
+1. `yarn build:test`
\ No newline at end of file
diff --git a/__test__/index.spec.ts b/__test__/index.spec.ts
index 7dbd348..8f827e5 100644
--- a/__test__/index.spec.ts
+++ b/__test__/index.spec.ts
@@ -1,10 +1,24 @@
 import test from 'ava'
 
-import { crawl } from '../index.js'
+import { crawl, Website } from '../index.js'
 
 test('crawl native', async (t) => {
   const { links, pages } = await crawl("https://rsseau.fr");
 
   t.assert(links.length > 1, "should be more than one link")
   t.assert(pages.length > 1, "should be more than one page")
-})
\ No newline at end of file
+})
+
+test('new website native', async (t) => {
+  const website = new Website("https://rsseau.fr");
+  await website.crawl();
+
+  t.assert(website.getLinks().length > 1, "should be more than one link")
+})
+
+test('new website scrape native', async (t) => {
+  const website = new Website("https://rsseau.fr");
+  await website.scrape();
+
+  t.assert(website.getPages().length > 1, "should be more than one page")
+})
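
The new tests assert only on result counts. Below is a hedged sketch, not part of this patch, of one further check against the `Page` shape declared in `index.d.ts` below; the test name and URL are illustrative:

```ts
import test from 'ava'

import { Website } from '../index.js'

// hypothetical follow-up check: scraped pages expose the
// `url` and `content` strings declared on the Page interface
test('scraped page shape', async (t) => {
  const website = new Website("https://rsseau.fr");
  await website.scrape();

  const pages = website.getPages();

  t.assert(pages.length > 0, "should scrape at least one page")
  t.is(typeof pages[0].url, "string")
  t.assert(pages[0].content.length > 0, "page content should not be empty")
})
```
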
diff --git a/index.d.ts b/index.d.ts
index e987c67..efd1161 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -11,10 +11,22 @@ export interface Page {
   content: string
 }
 /** crawl a website gathering all links to array */
-export function crawl(n: string): Promise<Website>
-export class Website {
+export function crawl(url: string): Promise<NWebsite>
+/** website main data from rust to node */
+export class NWebsite {
   /** all of the website links. */
   links: Array<string>
   /** the pages found */
   pages: Array<Page>
 }
+export class Website {
+  constructor(url: string)
+  /** crawl a website */
+  crawl(): Promise<void>
+  /** scrape a website */
+  scrape(): Promise<void>
+  /** get all the links of a website */
+  getLinks(): Array<string>
+  /** get all the pages of a website */
+  getPages(): Array<Page>
+}
diff --git a/index.js b/index.js
index 278e2f0..89267d1 100644
--- a/index.js
+++ b/index.js
@@ -252,7 +252,8 @@ if (!nativeBinding) {
   throw new Error(`Failed to load native binding`)
 }
 
-const { Website, crawl } = nativeBinding
+const { NWebsite, crawl, Website } = nativeBinding
 
-module.exports.Website = Website
+module.exports.NWebsite = NWebsite
 module.exports.crawl = crawl
+module.exports.Website = Website
diff --git a/src/lib.rs b/src/lib.rs
index 24cd76f..9ca28a5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -14,7 +14,8 @@ pub struct Page {
 }
 
 #[napi]
-pub struct Website {
+/// website main data from rust to node
+pub struct NWebsite {
   /// all of the website links.
   pub links: Vec<String>,
   /// the pages found
@@ -23,8 +24,8 @@
 
 #[napi]
 /// crawl a website gathering all links to array
-pub async fn crawl(n: String) -> Website {
-  let mut website = spider::website::Website::new(&n);
+pub async fn crawl(url: String) -> NWebsite {
+  let mut website = spider::website::Website::new(&url);
   let mut rx2 = website
     .subscribe(16)
     .expect("sync feature should be enabled");
@@ -59,5 +60,64 @@
 
   let links = pages.iter().map(|x| x.url.clone()).collect::<Vec<String>>();
 
-  Website { links, pages }
+  NWebsite { links, pages }
+}
+
+#[napi]
+pub struct Website {
+  /// the website from spider
+  inner: spider::website::Website,
+}
+
+#[napi]
+impl Website {
+  #[napi(constructor)]
+  pub fn new(url: String) -> Self {
+    Website {
+      inner: spider::website::Website::new(&url),
+    }
+  }
+  #[napi]
+  /// crawl a website
+  pub async unsafe fn crawl(&mut self) {
+    self.inner.crawl().await;
+  }
+
+  #[napi]
+  /// scrape a website
+  pub async unsafe fn scrape(&mut self) {
+    self.inner.scrape().await;
+  }
+
+  #[napi]
+  /// get all the links of a website
+  pub fn get_links(&self) -> Vec<String> {
+    let links = self
+      .inner
+      .get_links()
+      .iter()
+      .map(|x| x.as_ref().to_string())
+      .collect::<Vec<String>>();
+    links
+  }
+
+  /// get all the pages of a website
+  #[napi]
+  pub fn get_pages(&self) -> Vec<Page> {
+    let mut pages: Vec<Page> = Vec::new();
+
+    match self.inner.get_pages() {
+      Some(p) => {
+        for page in p.iter() {
+          pages.push(Page {
+            url: page.get_url().into(),
+            content: page.get_html(),
+          });
+        }
+      }
+      _ => (),
+    }
+
+    pages
+  }
 }
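
Taken together, the patch exposes two ways to consume the binding: the stateful `Website` class, which keeps the spider instance on the Rust side between calls, and the one-shot `crawl` helper, which returns a plain `NWebsite` data object. A minimal usage sketch contrasting the two, assuming the package name from the README:

```ts
import { crawl, Website } from '@spider-rs/spider-rs'

// stateful: the native spider instance lives behind the class,
// so links and pages can be read back after each run
const website = new Website("https://rsseau.fr");
await website.crawl();
console.log(website.getLinks().length);

// one-shot: crawl() builds, runs, and discards the spider,
// returning only the collected { links, pages } data (NWebsite)
const { links, pages } = await crawl("https://rsseau.fr");
console.log(links.length, pages.length);
```

Note that `crawl` and `scrape` are declared `async unsafe fn` on the Rust side because napi-rs requires async methods that take `&mut self` to be marked `unsafe`; the class design also defers copying page data across the FFI boundary until `getPages()` is actually called.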