Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding the libre textbooks #149

Open
wants to merge 23 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions data/libre_textbooks/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
---
name: libre_textbooks
description: A dataset of scraped articles from libre textbooks
targets:
- id: html
description: A scraped page from libre textbooks
units:
type: text
names:
- natural language article
pubchem_aids: []
uris: []
identifiers:
- id: url
type: string
description: url of the page the content is scraped from
- id: text_length
type: int
description: text character count
Comment on lines +17 to +19
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- id: text_length
type: int
description: text character count

license: CC BY 4.0
links:
- name: Libre Textbooks
url: https://chem.libretexts.org/Bookshelves
description: ''
- name: Hugging Face dataset upload
url: https://huggingface.co/datasets/Hack90/libre_chem_textbooks
description: Hugging Face dataset uploaded to HF account
benchmarks: []
num_points: 3740
bibtex: []
76 changes: 76 additions & 0 deletions data/libre_textbooks/transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from datasets import load_dataset
import pandas as pd
import yaml


# NOTE(review): this path is not referenced anywhere in the visible code —
# presumably the listed lines were already removed in the HF dataset upload;
# confirm and either wire it in or delete it. The absolute /workspaces/ path
# is also machine-specific.
LINES_TO_REMOVE = "/workspaces/chemnlp/data/libre_textbooks/lines_to_remove.jsonl"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not used below. Are those lines already removed on the HF dataset upload?

# Hugging Face Hub identifier of the raw scraped dataset.
RAW_DATASET = "Hack90/libre_chem_textbooks"


# Where create_meta_yaml() writes the generated config (relative to repo root).
META_YAML_PATH = "./data/libre_textbooks/meta.yaml"

# Template for meta.yaml; "num_points" is overwritten with the real count
# before dumping (see create_meta_yaml).
META_TEMPLATE = {
    "name": "libre_textbooks",  # unique identifier, we will also use this for directory names
    "description": "A dataset of scraped articles from libre textbooks",
    "targets": [
        {
            "id": "html",  # name of the column in a tabular dataset
            "description": "A scraped page from libre textbooks",
            "units": None,  # units of the values in this column (leave empty if unitless)
            # "text" (not "string") — matches the committed meta.yaml and the
            # allowed values: "categorical", "ordinal", "continuous", "text"
            "type": "text",
            "names": [  # names for the property (to sample from for building the prompts)
                "natural language article",
            ],
            "pubchem_aids": [],
            "uris": [],
        },
    ],
    "identifiers": [
        {
            "id": "url",  # column name
            "type": "Other",  # can be "SMILES", "SELFIES", "IUPAC", "Other"
            "description": "url of the page the content is scraped from",
        },
    ],
    "license": "CC BY 4.0",  # license under which the original dataset was published
    "links": [  # list of relevant links (original dataset, other uses, etc.)
        {
            "name": "Libre Textbooks",
            "url": "https://chem.libretexts.org/Bookshelves",
            "description": "",
        },
        {
            "name": "Hugging Face dataset upload",
            "url": "https://huggingface.co/datasets/Hack90/libre_chem_textbooks",
            "description": "Hugging Face dataset uploaded to HF account",  # Hopefully will move this
            # to the openbioml space
        },
    ],
    "benchmarks": [],
    "num_points": 3740,  # number of datapoints in this dataset
    "bibtex": [
        # noqa
    ],
}


def get_raw_data(raw_dataset: str = RAW_DATASET) -> pd.DataFrame:
    """Load the raw dataset from the Hugging Face Hub.

    Args:
        raw_dataset: Hub identifier of the dataset to load.

    Returns:
        The "train" split as a pandas DataFrame.
    """
    dataset = load_dataset(raw_dataset)
    # Dataset.to_pandas() already returns a DataFrame; no need to re-wrap it
    # in pd.DataFrame (the original wrapper was a redundant copy).
    return dataset["train"].to_pandas()


def create_meta_yaml(num_points: int) -> None:
    """Write the dataset's meta.yaml config with the actual datapoint count.

    Args:
        num_points: number of datapoints to record under "num_points".
    """
    # Merge into a copy so the module-level META_TEMPLATE is not mutated
    # as a side effect; the dumped file is identical either way.
    meta = {**META_TEMPLATE, "num_points": num_points}
    with open(META_YAML_PATH, "w+") as f:
        # sort_keys=False preserves the template's field order in the output.
        yaml.dump(meta, f, sort_keys=False)
    print(f"Finished processing libre_textbooks {meta['name']} dataset!")


if __name__ == "__main__":
    # Load the raw split and record its size in the generated meta.yaml.
    raw_df = get_raw_data()
    create_meta_yaml(len(raw_df))
Loading