chore(lib): add website class
j-mendez committed Nov 27, 2023
1 parent 9dd7620 commit 2043c6e
Showing 5 changed files with 120 additions and 22 deletions.
35 changes: 23 additions & 12 deletions README.md
@@ -7,24 +7,35 @@ The [spider](https://github.com/spider-rs/spider-rs) project ported to nodejs via napi.
 1. `npm i @spider-rs/spider-rs --save`
 
 ```ts
-import { crawl } from '@spider-rs/spider-rs'
+import { Website } from '@spider-rs/spider-rs'
 
-// gather all the links found in a website fast concurrently.
-const { links, pages } = await crawl("https://rsseau.fr");
+const website = new Website("https://rsseau.fr");
+await website.crawl();
+console.log(website.getLinks())
 ```
 
-## Development
+Collect the resource.
 
-Install the napi cli `npm i @napi-rs/cli --global`.
+```ts
+import { Website } from '@spider-rs/spider-rs'
 
-1. `yarn build:test`
+const website = new Website("https://rsseau.fr");
+await website.scrape();
+console.log(website.getPages())
+```
 
-### TODO: Full Spider Port
+Use the crawl shortcut to get the page content and url.
 
-Port the modules to be used via nodejs to adhere to the spider interface.
+```ts
+import { crawl } from '@spider-rs/spider-rs'
 
-A full port would require FromNapi support on the following modules.
+const { links, pages } = await crawl("https://rsseau.fr");
+console.log(pages)
+```
 
+
+## Development
+
+Install the napi cli `npm i @napi-rs/cli --global`.
+
-- compact_str
-- case_insensitive_str
-- small_vec
+1. `yarn build:test`
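
Per the `index.d.ts` typings below, each entry in `pages` is a `Page` carrying the fetched `url` and its raw `content`; a minimal sketch iterating the shortcut's result:

```ts
import { crawl } from '@spider-rs/spider-rs'

const { links, pages } = await crawl('https://rsseau.fr')

// every Page carries the fetched url and its raw html content
for (const page of pages) {
  console.log(page.url, page.content.length)
}
console.log(links.length, 'links found')
```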
18 changes: 16 additions & 2 deletions __test__/index.spec.ts
@@ -1,10 +1,24 @@
 import test from 'ava'
 
-import { crawl } from '../index.js'
+import { crawl, Website } from '../index.js'
 
 test('crawl native', async (t) => {
   const { links, pages } = await crawl("https://rsseau.fr");
 
   t.assert(links.length > 1, "should be more than one link")
   t.assert(pages.length > 1, "should be more than one page")
-})
+})
+
+test('new website native', async (t) => {
+  const website = new Website("https://rsseau.fr");
+  await website.crawl();
+
+  t.assert(website.getLinks().length > 1, "should be more than one link")
+})
+
+test('new website scrape native', async (t) => {
+  const website = new Website("https://rsseau.fr");
+  await website.scrape();
+
+  t.assert(website.getPages().length > 1, "should be more than one page")
+})
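
A natural follow-up (not part of this commit) would assert the shape of the scraped pages rather than only their count; a sketch in the same ava style, reusing the `test` and `Website` imports above:

```ts
test('scraped pages carry url and content', async (t) => {
  const website = new Website("https://rsseau.fr");
  await website.scrape();

  for (const page of website.getPages()) {
    t.assert(page.url.length > 0, "every page should have a url")
    t.assert(typeof page.content === 'string', "every page should expose html content")
  }
})
```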
16 changes: 14 additions & 2 deletions index.d.ts
@@ -11,10 +11,22 @@ export interface Page {
   content: string
 }
 /** crawl a website gathering all links to array */
-export function crawl(n: string): Promise<Website>
-export class Website {
+export function crawl(url: string): Promise<NWebsite>
+/** website main data from rust to node */
+export class NWebsite {
   /** all of the website links. */
   links: Array<string>
   /** the pages found */
   pages: Array<Page>
 }
+export class Website {
+  constructor(url: string)
+  /** crawl a website */
+  crawl(): Promise<void>
+  /** scrape a website */
+  scrape(): Promise<void>
+  /** get all the links of a website */
+  getLinks(): Array<string>
+  /** get all the pages of a website */
+  getPages(): Array<Page>
+}
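
Read against these declarations, `crawl()` resolves to a plain-data `NWebsite` while `Website` keeps its state on the Rust side and exposes results through methods; a short sketch of both, assuming the package's runtime exports match this file:

```ts
import { crawl, Website, type Page } from '@spider-rs/spider-rs'

// shortcut: plain data fields, no instance to hold on to
const data = await crawl('https://rsseau.fr')
const first: Page | undefined = data.pages[0]

// class: stateful instance, results read back through methods
const website = new Website('https://rsseau.fr')
await website.crawl()
const links: Array<string> = website.getLinks()
```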
5 changes: 3 additions & 2 deletions index.js
@@ -252,7 +252,8 @@ if (!nativeBinding) {
   throw new Error(`Failed to load native binding`)
 }
 
-const { Website, crawl } = nativeBinding
+const { NWebsite, crawl, Website } = nativeBinding
 
-module.exports.Website = Website
+module.exports.NWebsite = NWebsite
 module.exports.crawl = crawl
+module.exports.Website = Website
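
Since the loader re-exports the native binding this way, CommonJS consumers can destructure the same names; a minimal sketch, assuming the published package name from the README:

```ts
// CommonJS-style usage of the exports wired up above
// (NWebsite is exported too, as the shape of crawl()'s result)
const { Website, crawl } = require('@spider-rs/spider-rs')

async function main() {
  const website = new Website('https://rsseau.fr')
  await website.crawl()
  console.log(website.getLinks().length, 'links')
}

main()
```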
68 changes: 64 additions & 4 deletions src/lib.rs
@@ -14,7 +14,8 @@ pub struct Page {
 }
 
 #[napi]
-pub struct Website {
+/// website main data from rust to node
+pub struct NWebsite {
   /// all of the website links.
   pub links: Vec<String>,
   /// the pages found
@@ -23,8 +24,8 @@ pub struct Website {
 
 #[napi]
 /// crawl a website gathering all links to array
-pub async fn crawl(n: String) -> Website {
-  let mut website = spider::website::Website::new(&n);
+pub async fn crawl(url: String) -> NWebsite {
+  let mut website = spider::website::Website::new(&url);
   let mut rx2 = website
     .subscribe(16)
     .expect("sync feature should be enabled");
@@ -59,5 +60,64 @@ pub async fn crawl(n: String) -> Website {
 
   let links = pages.iter().map(|x| x.url.clone()).collect::<Vec<String>>();
 
-  Website { links, pages }
+  NWebsite { links, pages }
 }
+
+#[napi]
+pub struct Website {
+  /// the website from spider
+  inner: spider::website::Website,
+}
+
+#[napi]
+impl Website {
+  #[napi(constructor)]
+  pub fn new(url: String) -> Self {
+    Website {
+      inner: spider::website::Website::new(&url),
+    }
+  }
+  #[napi]
+  /// crawl a website
+  pub async unsafe fn crawl(&mut self) {
+    self.inner.crawl().await;
+  }
+
+  #[napi]
+  /// scrape a website
+  pub async unsafe fn scrape(&mut self) {
+    self.inner.scrape().await;
+  }
+
+  #[napi]
+  /// get all the links of a website
+  pub fn get_links(&self) -> Vec<String> {
+    let links = self
+      .inner
+      .get_links()
+      .iter()
+      .map(|x| x.as_ref().to_string())
+      .collect::<Vec<String>>();
+    links
+  }
+
+  /// get all the pages of a website
+  #[napi]
+  pub fn get_pages(&self) -> Vec<Page> {
+    let mut pages: Vec<Page> = Vec::new();
+
+    match self.inner.get_pages() {
+      Some(p) => {
+        for page in p.iter() {
+          pages.push(Page {
+            url: page.get_url().into(),
+            content: page.get_html(),
+          });
+        }
+      }
+      _ => (),
+    }
+
+    pages
+  }
+}
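
Note that `get_pages` returns an empty `Vec` whenever the inner site holds no retained pages; the README and tests pair `getPages()` with `scrape()`, which suggests only `scrape()` keeps page bodies. A sketch of the expected behavior from Node, under that assumption:

```ts
import { Website } from '@spider-rs/spider-rs'

// crawl() walks the site; under the assumption above, page
// bodies are not retained and getPages() comes back empty
const crawled = new Website('https://rsseau.fr')
await crawled.crawl()
console.log(crawled.getPages().length) // expected: 0

// scrape() stores each page, so url/content pairs come back
const scraped = new Website('https://rsseau.fr')
await scraped.scrape()
console.log(scraped.getPages().length) // expected: > 0
```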
