Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions examples/llm_learner_rwthdbis_taxonomy_discovery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Import core modules from the OntoLearner library
from ontolearner import LearnerPipeline, train_test_split
from ontolearner import ChordOntology, RWTHDBISTaxonomyLearner

# Load the Chord ontology, which exposes hierarchical (parent, child) relations
# for taxonomy discovery, and read it into memory.
ontology = ChordOntology()
ontology.load()
extracted = ontology.extract()

# Split the extracted data into train/test sets (test_size=0.2); the fixed
# random_state keeps the split reproducible between runs.
train_data, test_data = train_test_split(extracted, test_size=0.2, random_state=42)

# Settings for the supervised taxonomy classifier (encoder-based fine-tuning).
# - negative_ratio controls the number of non-edge examples.
# - bidirectional_templates creates both (p→c) and (c→p) views.
# - Context features are optional; context_json_path=None leaves them without a
#   JSON file of type descriptions.
learner_settings = {
    "model_name": "microsoft/deberta-v3-small",
    "output_dir": "./results/",
    "num_train_epochs": 1,
    "per_device_train_batch_size": 8,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-5,
    "max_length": 256,
    "seed": 42,
    "negative_ratio": 5,
    "bidirectional_templates": True,
    "context_json_path": None,
    "ontology_name": ontology.ontology_full_name,
}
learner = RWTHDBISTaxonomyLearner(**learner_settings)

# Wrap the learner in a pipeline, identified by the learner's own model name.
pipeline = LearnerPipeline(llm=learner, llm_id=learner.model_name, ontologizer_data=False)

# Run the full learning pipeline on the taxonomy-discovery task.
outputs = pipeline(
    train_data=train_data,
    test_data=test_data,
    task="taxonomy-discovery",
    evaluate=True,
    ontologizer_data=False,
)

# Report the evaluation metrics (shown as {'precision': ..., 'recall': ..., 'f1_score': ...})
# followed by the total elapsed time for training + prediction + evaluation.
for label, key in (("Metrics:", "metrics"), ("Elapsed time:", "elapsed_time")):
    print(label, outputs[key])

# Dump everything the pipeline returned (includes predictions).
print(outputs)
50 changes: 50 additions & 0 deletions examples/llm_learner_rwthdbis_term_typing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Import core modules from the OntoLearner library
from ontolearner import LearnerPipeline, train_test_split, AgrO
from ontolearner import RWTHDBISTermTypingLearner

# Load the AgrO ontology.
# AgrO provides term-typing supervision where each term can be annotated with
# one or more types.
ontology = AgrO()
ontology.load()
data = ontology.extract()

# Split the labeled term-typing data into train and test sets (test_size=0.2,
# fixed seed for a reproducible split).
split_options = {"test_size": 0.2, "random_state": 42}
train_data, test_data = train_test_split(data, **split_options)

# Settings for the supervised encoder-based term-typing classifier.
# This fine-tunes DeBERTa v3 on (term → type) signals; increase epochs for
# stronger results.
learner_settings = dict(
    model_name="microsoft/deberta-v3-small",
    output_dir="./results/deberta-v3",
    num_train_epochs=30,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    max_length=64,
    seed=42,
)
learner = RWTHDBISTermTypingLearner(**learner_settings)

# Build the pipeline and pass raw structured objects end-to-end.
pipeline = LearnerPipeline(llm=learner, llm_id=learner.model_name, ontologizer_data=False)

# Run the full learning pipeline on the term-typing task.
outputs = pipeline(
    train_data=train_data,
    test_data=test_data,
    task="term-typing",
    evaluate=True,
    ontologizer_data=False,
)

# Report the evaluation metrics (shown as {'precision': ..., 'recall': ..., 'f1_score': ...})
# followed by the total elapsed time for training + prediction + evaluation.
for label, key in (("Metrics:", "metrics"), ("Elapsed time:", "elapsed_time")):
    print(label, outputs[key])

# Dump everything the pipeline returned (includes predictions).
print(outputs)
66 changes: 66 additions & 0 deletions examples/llm_learner_sbunlp_fs_taxonomy_discovery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Import core modules from the OntoLearner library
from ontolearner import GeoNames, train_test_split, LearnerPipeline
# Import the specific Few-Shot Learner implementation
from ontolearner import SBUNLPFewShotLearner

# Load the GeoNames ontology for taxonomy discovery.
# GeoNames provides geographic parent-child relationships (is-a hierarchy).
ontology = GeoNames()
ontology.load()
data = ontology.extract()  # the taxonomic relationships extracted from the ontology object

# Split the taxonomic relationships into train and test sets.
# test_size=0.6: 60% of the data is used for testing (terms to find relations for).
train_data, test_data = train_test_split(data, test_size=0.6, random_state=42)

# Settings for the SBUNLP few-shot learner using the Qwen model.
# It performs in-context learning via N x M batch prompting.
few_shot_settings = {
    # Model / decoding
    "model_name": "Qwen/Qwen2.5-0.5B-Instruct",  # the Qwen model to load
    "try_4bit": True,  # uses 4-bit if bitsandbytes + CUDA are available, for memory efficiency
    "max_new_tokens": 140,  # limit the length of the model's response (for JSON output)
    "max_input_tokens": 1500,  # limit the total prompt length (context window)
    "temperature": 0.0,  # 0.0 for deterministic output (best for structured JSON)
    "top_p": 1.0,  # top-p sampling disabled with temperature=0.0
    # Grid settings (N x M prompts)
    "n_train_chunks": 7,  # N: split training examples (few-shot context) into 7 chunks
    "m_test_chunks": 7,  # M: split test terms (vocabulary) into 7 chunks (total 49 prompts)
    # Run controls
    "limit_prompts": None,  # None runs all N x M prompts; set to an integer for a dry-run
    "output_dir": "./outputs/taskC_batches",  # optional: dump per-prompt JSON results for debugging
}
llm_learner = SBUNLPFewShotLearner(**few_shot_settings)

# The same flag is given to the pipeline constructor and to the run call:
# let the learner flatten structured ontology objects via its tasks_* helpers.
USE_ONTOLOGIZER_DATA = True

# Build the pipeline around the few-shot learner.
pipe = LearnerPipeline(
    llm=llm_learner,
    llm_id=llm_learner.model_name,
    ontologizer_data=USE_ONTOLOGIZER_DATA,
    device="auto",  # automatically select CUDA or CPU
)

# Run the full learning pipeline on the taxonomy-discovery task.
outputs = pipe(
    train_data=train_data,
    test_data=test_data,
    task="taxonomy-discovery",
    evaluate=True,
    ontologizer_data=USE_ONTOLOGIZER_DATA,
)

# Evaluation results (None is printed when the "metrics" key is absent).
metrics = outputs.get("metrics")
print("Metrics:", metrics)

# Total elapsed time for training + prediction + evaluation.
elapsed = outputs["elapsed_time"]
print("Elapsed time:", elapsed)

# Everything the pipeline returned (includes predictions).
print(outputs)
81 changes: 81 additions & 0 deletions examples/llm_learner_sbunlp_text2onto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import os
import torch
#Import all the required classes
from ontolearner import SBUNLPText2OntoLearner
from ontolearner.learner.text2onto.sbunlp import LocalAutoLLM

# Local folder where the dataset is stored.
# This path is relative to the directory where the script is executed
# (e.g., E:\OntoLearner\examples).
LOCAL_DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology"

# Every other path below is derived from LOCAL_DATA_DIR, so pointing the script
# at another dataset folder only requires changing the line above.
TRAIN_DIR = os.path.join(LOCAL_DATA_DIR, "train")
TEST_DIR = os.path.join(LOCAL_DATA_DIR, "test")

# Create the train and test subdirectories if they don't already exist.
os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(TEST_DIR, exist_ok=True)

# Input files: these point to ALREADY SAVED files and are used as input for the
# fit and predict phases.
DOCS_ALL_PATH = os.path.join(TRAIN_DIR, "documents.jsonl")
TERMS2DOC_PATH = os.path.join(TRAIN_DIR, "terms2docs.json")
DOCS_TEST_PATH = os.path.join(TEST_DIR, "text2onto_ecology_test_documents.jsonl")

# Output files for predictions (saved directly under LOCAL_DATA_DIR/test).
# They are passed as out_jsonl to predict_terms / predict_types below.
TERMS_PRED_OUT = os.path.join(TEST_DIR, "extracted_terms_ecology.jsonl")
TYPES_PRED_OUT = os.path.join(TEST_DIR, "extracted_types_ecology.jsonl")

# --- Initialize and load the learner ---
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Determine the device for inference (GPU or CPU).
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Instantiate the underlying LLM helper
# (per the original notes, LocalAutoLLM handles model loading and generation).
llm_model_helper = LocalAutoLLM(device=DEVICE)

# Instantiate the main learner class, passing the LLM helper to its constructor.
learner = SBUNLPText2OntoLearner(model=llm_model_helper, device=DEVICE)

# Load the model through the learner's model attribute; 4-bit loading is only
# requested when CUDA was detected above.
LOAD_IN_4BIT = DEVICE == "cuda"
learner.model.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT)

# Build few-shot exemplars (fit phase).
# The fit method uses the local data paths to build the in-context learning prompts.
learner.fit(
    train_docs_jsonl=DOCS_ALL_PATH,
    terms2doc_json=TERMS2DOC_PATH,
    sample_size=28,
    seed=123,  # seed for stratified random sampling stability
)

MAX_NEW_TOKENS = 100

# Term extraction subtask.
terms_written = learner.predict_terms(
    docs_test_jsonl=DOCS_TEST_PATH,
    out_jsonl=TERMS_PRED_OUT,
    max_new_tokens=MAX_NEW_TOKENS,
)
print(f"✅ Term Extraction Complete. Wrote {terms_written} prediction lines.")

# Type extraction subtask.
types_written = learner.predict_types(
    docs_test_jsonl=DOCS_TEST_PATH,
    out_jsonl=TYPES_PRED_OUT,
    max_new_tokens=MAX_NEW_TOKENS,
)
print(f"✅ Type Extraction Complete. Wrote {types_written} prediction lines.")

# Evaluation is best-effort: any failure is reported and the script still ends
# normally.
# NOTE(review): the gold file passed here is the *training* terms2docs.json,
# while the predictions were produced from the *test* documents — confirm this
# is the intended gold file.
try:
    # Evaluate term extraction using the learner's F1 function and gold data.
    f1_term = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TERMS_PRED_OUT, key="term")
    print(f"Final Term Extraction F1: {f1_term:.4f}")

    # Evaluate type extraction.
    f1_type = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TYPES_PRED_OUT, key="type")
    print(f"Final Type Extraction F1: {f1_type:.4f}")
except Exception as e:
    # Catches errors like missing sklearn (ImportError) or missing prediction
    # files (FileNotFoundError).
    print(f"❌ Evaluation Error: {e}. Ensure sklearn is installed and prediction files were created.")
55 changes: 55 additions & 0 deletions examples/llm_learner_sbunlp_zs_term_typing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Import core modules from the OntoLearner library
from ontolearner import AgrO, train_test_split, LearnerPipeline
# Import the specific Zero-Shot Learner implementation for Term Typing
from ontolearner import SBUNLPZSLearner

# Load the AgrO ontology for type inventory and test data.
ontology = AgrO()
ontology.load()
data = ontology.extract()  # the full set of relationships/terms

# Split the data into train (to learn the type inventory) and test (terms to predict).
# test_size=0.6: 60% of the data is used for testing.
split_options = dict(test_size=0.6, random_state=42)
train_data, test_data = train_test_split(data, **split_options)

# Settings for the Qwen zero-shot learner (inference-only).
# This learner's 'fit' phase learns the vocabulary of allowed type labels.
# Not passed here: device (auto-detected) and token (assuming public model access).
zero_shot_settings = {
    "model_id": "Qwen/Qwen2.5-0.5B-Instruct",  # the Qwen model to load
    "max_new_tokens": 64,  # sufficient length for a JSON list of types
    "temperature": 0.0,  # ensures deterministic (greedy) output
}
llm_learner = SBUNLPZSLearner(**zero_shot_settings)

# Build the pipeline around the zero-shot learner.
pipe = LearnerPipeline(
    llm=llm_learner,
    llm_id=llm_learner.model_id,
    ontologizer_data=False,
    device="cpu",  # this argument selects CUDA or CPU; CPU is used here
)

# Run the full learning pipeline on the term-typing task.
outputs = pipe(
    train_data=train_data,
    test_data=test_data,
    task="term-typing",
    evaluate=True,
    ontologizer_data=False,
)

# Report the evaluation metrics, then the total elapsed time for learning
# (type inventory) + prediction + evaluation. A missing key prints as None.
for label, key in (("Metrics:", "metrics"), ("Elapsed time:", "elapsed_time")):
    print(label, outputs.get(key))

# Everything the pipeline returned (includes predictions).
print(outputs)
64 changes: 64 additions & 0 deletions examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Import core modules from the OntoLearner library
from ontolearner import GeoNames, train_test_split, LearnerPipeline
from ontolearner import SKHNLPSequentialFTLearner

# Load the GeoNames ontology for taxonomy discovery.
# GeoNames provides geographic parent-child relationships (is-a hierarchy).
ontology = GeoNames()
ontology.load()
data = ontology.extract()

# Split the taxonomic relationships into train and test sets (test_size=0.2,
# fixed seed for a reproducible split).
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# One name for the checkpoint: it is used both as the learner's model_name and
# as the pipeline's llm_id.
MODEL_NAME = "bert-large-uncased"

# Settings for the supervised BERT SFT learner for taxonomy discovery.
# This fine-tunes BERT-Large using sequential prompts on (Parent, Child) pairs.
sft_settings = dict(
    model_name=MODEL_NAME,
    n_prompts=2,
    random_state=1403,
    device="cpu",  # Note: CPU training for BERT-Large is very slow.
    # User-defined training arguments
    output_dir="./results/",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs/",
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
bert_learner = SKHNLPSequentialFTLearner(**sft_settings)

# Build the pipeline around the BERT learner.
pipeline = LearnerPipeline(llm=bert_learner, llm_id=MODEL_NAME, ontologizer_data=False)

# Run the full learning pipeline on the taxonomy-discovery task.
outputs = pipeline(
    train_data=train_data,
    test_data=test_data,
    task="taxonomy-discovery",
    evaluate=True,
    ontologizer_data=False,
)

# Evaluation results (None is printed when the "metrics" key is absent).
metrics = outputs.get("metrics")
print("Metrics:", metrics)

# Total elapsed time for training + prediction + evaluation.
elapsed = outputs["elapsed_time"]
print("Elapsed time:", elapsed)

# Everything the pipeline returned (includes predictions).
print(outputs)
Loading