examples/llm_learner_alexbek_rag_term_typing.py (50 additions)
# Import core modules from the OntoLearner library
from ontolearner import GeoNames, train_test_split, LearnerPipeline
from ontolearner import AlexbekRAGLearner

# Load the GeoNames ontology.
ontology = GeoNames()
ontology.load()

# Extract labeled items and split into train/test sets for evaluation
train_data, test_data = train_test_split(ontology.extract(), test_size=0.2, random_state=42)

# Configure a Retrieval-Augmented Generation (RAG) term-typing classifier.
# - llm_model_id: generator used to predict types from the prompt + retrieved examples
# - retriever_model_id: encoder used to embed items and fetch top-k similar (RAG) examples
# - device: "cuda" for GPU or "cpu"
# - top_k: number of nearest examples to retrieve per query term
# - max_new_tokens: decoding budget of the LLM during prediction
# - output_dir: where intermediate artifacts / logs can be stored
rag_learner = AlexbekRAGLearner(
    llm_model_id="Qwen/Qwen2.5-0.5B-Instruct",
    retriever_model_id="sentence-transformers/all-MiniLM-L6-v2",
    device="cuda",
    top_k=3,
    max_new_tokens=256,
    output_dir="./results/",
)
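
# For intuition, the retrieval step can be sketched directly with
# sentence-transformers (an illustrative snippet, not AlexbekRAGLearner's
# internals; the terms below are hypothetical):
#
#     from sentence_transformers import SentenceTransformer, util
#     encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
#     corpus = ["Alps", "Danube", "Berlin"]                  # train terms
#     corpus_emb = encoder.encode(corpus, convert_to_tensor=True)
#     query_emb = encoder.encode("Rhine", convert_to_tensor=True)
#     hits = util.semantic_search(query_emb, corpus_emb, top_k=3)[0]
#     # -> [{'corpus_id': ..., 'score': ...}, ...] examples fed into the prompt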

# Build the pipeline and pass raw structured objects end-to-end.
# We place the RAG learner in the llm slot and set llm_id accordingly.
pipe = LearnerPipeline(
    llm=rag_learner,
    llm_id="Qwen/Qwen2.5-0.5B-Instruct",
    ontologizer_data=True,
)

# Run the full learning pipeline on the term-typing task
# - task="term-typing" (Task B)
# - evaluate=True computes precision/recall/F1 on the held-out test split
# - ontologizer_data=True must match the pipeline flag above
outputs = pipe(
    train_data=train_data,
    test_data=test_data,
    task="term-typing",
    evaluate=True,
    ontologizer_data=True,
)

# Display the evaluation results and runtime
print("Metrics:", outputs.get("metrics")) # e.g., {'precision': ..., 'recall': ..., 'f1_micro': ..., ...}
print("Elapsed time (s):", outputs.get("elapsed_time"))
examples/llm_learner_alexbek_rf_term_typing.py (54 additions)
# Import core modules from the OntoLearner library
from ontolearner import GeoNames, train_test_split, LearnerPipeline
from ontolearner import AlexbekRFLearner # A random-forest term-typing learner over text+graph features

# Load the GeoNames ontology and extract labeled term-typing data
ontology = GeoNames()
ontology.load()

data = ontology.extract()

# Split the labeled term-typing data into train and test sets
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Configure the RF-based learner (embeddings + optional graph features)
# - device: "cpu" or "cuda"
# - threshold: decision threshold for multi-label assignment
# - use_graph_features: include ontology-graph-derived features if available
rf_learner = AlexbekRFLearner(
    device="cpu",             # switch to "cuda" if you have a GPU
    batch_size=16,
    max_length=512,           # max tokenizer length for embedding-model inputs
    threshold=0.30,           # probability cutoff for assigning each type
    use_graph_features=True,  # set False for a pure RF on text embeddings only
)

# Build the pipeline and pass raw structured objects end-to-end.
pipe = LearnerPipeline(
    retriever=rf_learner,
    retriever_id="intfloat/e5-base-v2",  # or "Qwen/Qwen3-Embedding-4B" if you have sufficient GPU memory
    ontologizer_data=True,               # True if data is already {"term": ..., "types": [...], ...}
    device="cpu",
    batch_size=16,
)

# Run the full learning pipeline on the term-typing task
outputs = pipe(
    train_data=train_data,
    test_data=test_data,
    task="term-typing",
    evaluate=True,
    ontologizer_data=True,
)

# Display the evaluation summary, runtime, and ontology metadata
print("Metrics:", outputs.get("metrics"))
print("Elapsed time:", outputs["elapsed_time"])
print(ontology)
examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py (41 additions)
from ontolearner import GeoNames, train_test_split, LearnerPipeline
from ontolearner import AlexbekCrossAttnLearner

# 1) Load & split
ontology = GeoNames()
ontology.load()
data = ontology.extract()
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# 2) Configure the cross-attention learner
cross_learner = AlexbekCrossAttnLearner(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # or a "Qwen/Qwen2.5-1.5B-..." wrapped as a SentenceTransformer
    device="cpu",
    num_heads=8,
    lr=5e-5,
    weight_decay=0.01,
    num_epochs=1,
    batch_size=256,
    neg_ratio=1.0,
    output_dir="./results/crossattn/",
    seed=42,
)
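
# The core idea can be sketched with torch.nn.MultiheadAttention (an
# illustrative toy, not AlexbekCrossAttnLearner's internals): the child-term
# embedding attends over the parent-term embedding, and the attended output
# is scored for "is-a" compatibility.
#
#     import torch
#     attn = torch.nn.MultiheadAttention(embed_dim=384, num_heads=8, batch_first=True)
#     child = torch.randn(1, 1, 384)   # (batch, seq, dim) child embedding
#     parent = torch.randn(1, 1, 384)  # parent embedding
#     attended, _ = attn(query=child, key=parent, value=parent)
#     score = torch.sigmoid((attended * child).sum(-1))  # toy edge probability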

# 3) Build pipeline
pipeline = LearnerPipeline(
    llm=cross_learner,       # place the cross-attention learner in the LLM slot
    llm_id="cross-attn",     # label used for bookkeeping
    ontologizer_data=False,  # pass raw structured ontology objects straight through
)

# 4) Train + predict + evaluate
outputs = pipeline(
    train_data=train_data,
    test_data=test_data,
    task="taxonomy-discovery",
    evaluate=True,
    ontologizer_data=False,
)

print("Metrics:", outputs.get("metrics"))
print("Elapsed time:", outputs["elapsed_time"])
print(outputs)
examples/llm_learner_alexbek_text2onto.py (74 additions)
import os
import json
import torch

# LocalAutoLLM handles model loading/generation; AlexbekFewShotLearner provides fit/predict APIs
from ontolearner.learner.text2onto.alexbek import LocalAutoLLM, AlexbekFewShotLearner

# Local folder where the dataset is stored (relative to this script)
DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology"

# Input paths (already saved)
TRAIN_DOCS_PATH = os.path.join(DATA_DIR, "train", "documents.jsonl")
TRAIN_TERMS2DOCS_PATH = os.path.join(DATA_DIR, "train", "terms2docs.json")
TEST_DOCS_FULL_PATH = os.path.join(DATA_DIR, "test", "text2onto_ecology_test_documents.jsonl")

# Output paths
DOC_TERMS_OUT_PATH = os.path.join(DATA_DIR, "test", "extracted_terms_ecology.fast.jsonl")
TERMS2TYPES_OUT_PATH = os.path.join(DATA_DIR, "test", "terms2types_pred_ecology.fast.json")
TYPES2DOCS_OUT_PATH = os.path.join(DATA_DIR, "test", "types2docs_pred_ecology.fast.json")

# Device selection
DEVICE = (
    "cuda"
    if torch.cuda.is_available()
    else ("mps" if torch.backends.mps.is_available() else "cpu")
)

# Model config
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
LOAD_IN_4BIT = (DEVICE == "cuda") # 4-bit helps on GPU

# 1) Load LLM
llm = LocalAutoLLM(device=DEVICE)
llm.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT)
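
# Under the hood, 4-bit loading typically goes through bitsandbytes; a minimal
# transformers-only sketch (assuming bitsandbytes is installed, and shown for
# illustration rather than as LocalAutoLLM's actual code) would be:
#
#     from transformers import AutoModelForCausalLM, BitsAndBytesConfig
#     model = AutoModelForCausalLM.from_pretrained(
#         MODEL_ID,
#         quantization_config=BitsAndBytesConfig(load_in_4bit=True),
#         device_map="auto",
#     )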

# 2) Build few-shot exemplars from training split
learner = AlexbekFewShotLearner(model=llm, device=DEVICE)
learner.fit(
    train_docs_jsonl=TRAIN_DOCS_PATH,
    terms2doc_json=TRAIN_TERMS2DOCS_PATH,
    # use defaults for sample size and seed
)

# 3) Predict terms per test document
os.makedirs(os.path.dirname(DOC_TERMS_OUT_PATH), exist_ok=True)
num_written_doc_terms = learner.predict_terms(
    docs_test_jsonl=TEST_DOCS_FULL_PATH,
    out_jsonl=DOC_TERMS_OUT_PATH,
    # use defaults for max_new_tokens and few_shot_k
)
print(f"[terms] wrote {num_written_doc_terms} lines → {DOC_TERMS_OUT_PATH}")

# 4) Predict types for extracted terms, using the JSONL we just wrote
typing_summary = learner.predict_types_from_terms(
    doc_terms_jsonl=DOC_TERMS_OUT_PATH,  # read the term predictions we just wrote
    doc_terms_list=None,                 # not needed when doc_terms_jsonl is provided
    model_id=MODEL_ID,                   # reuse the same small model
    out_terms2types=TERMS2TYPES_OUT_PATH,
    out_types2docs=TYPES2DOCS_OUT_PATH,
    # use defaults for everything else
)

print(f"[types] {typing_summary['unique_terms']} unique terms | {typing_summary['types_count']} types")
print(f"[saved] {TERMS2TYPES_OUT_PATH}")
print(f"[saved] {TYPES2DOCS_OUT_PATH}")

# 5) Small preview of term→types
try:
    with open(TERMS2TYPES_OUT_PATH, "r", encoding="utf-8") as fin:
        preview = json.load(fin)[:3]
    print("[preview] first 3:")
    print(json.dumps(preview, ensure_ascii=False, indent=2))
except Exception as e:
    print(f"[preview] skipped: {e}")
examples/llm_learner_rwthdbis_taxonomy_discovery.py (57 additions)
# Import core modules from the OntoLearner library
from ontolearner import LearnerPipeline, train_test_split
from ontolearner import ChordOntology, RWTHDBISTaxonomyLearner

# Load the Chord ontology, which exposes hierarchical (parent, child) relations for taxonomy discovery
ontology = ChordOntology()
ontology.load() # Read entities, type system, and taxonomic edges into memory

# Extract typed taxonomic edges and split into train/test while preserving the structured shape
train_data, test_data = train_test_split(
    ontology.extract(),
    test_size=0.2,
    random_state=42,
)

# Initialize a supervised taxonomy classifier (encoder-based fine-tuning)
# Negative sampling controls the number of non-edge examples; bidirectional templates create both (p→c) and (c→p) views
# Context features are optional and can be enabled with with_context=True and a JSON path of type descriptions
learner = RWTHDBISTaxonomyLearner(
    model_name="microsoft/deberta-v3-small",
    output_dir="./results/",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    max_length=256,
    seed=42,
    negative_ratio=5,
    bidirectional_templates=True,
    context_json_path=None,
    ontology_name=ontology.ontology_full_name,
)
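
# For intuition, negative sampling and bidirectional templates can be sketched
# as follows (an illustrative snippet, not the learner's internals; the terms
# are hypothetical): each true (parent, child) edge is paired with
# `negative_ratio` corrupted edges, and every pair is verbalized in both
# directions.
#
#     import random
#     terms = ["chord", "triad", "seventh chord"]
#     positives = [("chord", "triad")]  # true taxonomic edges
#     negatives = [(p, random.choice(terms)) for p, _ in positives for _ in range(5)]
#     def verbalize(parent, child):
#         return [f"{child} is a kind of {parent}.", f"{parent} subsumes {child}."]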

# Build the pipeline
pipeline = LearnerPipeline(
    llm=learner,
    llm_id=learner.model_name,
    ontologizer_data=False,
)

# Run the full learning pipeline on the taxonomy-discovery task
outputs = pipeline(
    train_data=train_data,
    test_data=test_data,
    task="taxonomy-discovery",
    evaluate=True,
    ontologizer_data=False,
)

# Display the evaluation results
print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...}

# Display total elapsed time for training + prediction + evaluation
print("Elapsed time:", outputs['elapsed_time'])

# Print all returned outputs (including predictions)
print(outputs)
examples/llm_learner_rwthdbis_term_typing.py (50 additions)
# Import core modules from the OntoLearner library
from ontolearner import LearnerPipeline, train_test_split, AgrO
from ontolearner import RWTHDBISTermTypingLearner

# Load the AgrO ontology.
# AgrO provides term-typing supervision where each term can be annotated with one or more types.
ontology = AgrO()
ontology.load()
data = ontology.extract()

# Split the labeled term-typing data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Configure a supervised encoder-based classifier for term typing.
# This fine-tunes DeBERTa v3 on (term → type) signals; increase epochs for stronger results.
learner = RWTHDBISTermTypingLearner(
    model_name="microsoft/deberta-v3-small",
    output_dir="./results/deberta-v3",
    num_train_epochs=30,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    max_length=64,
    seed=42,
)
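
# One common recipe behind such learners is to score (term, candidate type)
# pairs with a cross-encoder. This sketch is illustrative only (not
# RWTHDBISTermTypingLearner's internal code; the term/type pair is
# hypothetical):
#
#     import torch
#     from transformers import AutoTokenizer, AutoModelForSequenceClassification
#     tok = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
#     clf = AutoModelForSequenceClassification.from_pretrained(
#         "microsoft/deberta-v3-small", num_labels=2
#     )
#     enc = tok("paddy field", "agricultural site", return_tensors="pt",
#               truncation=True, max_length=64)
#     with torch.no_grad():
#         p_match = clf(**enc).logits.softmax(-1)[0, 1].item()  # P(term has type)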

# Build the pipeline and pass raw structured objects end-to-end.
pipeline = LearnerPipeline(
    llm=learner,
    llm_id=learner.model_name,
    ontologizer_data=False,
)

# Run the full learning pipeline on the term-typing task
outputs = pipeline(
    train_data=train_data,
    test_data=test_data,
    task="term-typing",
    evaluate=True,
    ontologizer_data=False,
)

# Display the evaluation results
print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...}

# Display total elapsed time for training + prediction + evaluation
print("Elapsed time:", outputs['elapsed_time'])

# Print all returned outputs (including predictions)
print(outputs)
examples/llm_learner_sbunlp_fs_taxonomy_discovery.py (66 additions)
# Import core modules from the OntoLearner library
from ontolearner import GeoNames, train_test_split, LearnerPipeline
# Import the specific Few-Shot Learner implementation
from ontolearner import SBUNLPFewShotLearner

# Load the GeoNames ontology for taxonomy discovery.
# GeoNames provides geographic parent-child relationships (an is-a hierarchy).
ontology = GeoNames()
ontology.load()
data = ontology.extract() # Extract the list of taxonomic relationships from the ontology object

# Split the taxonomic relationships into train and test sets
train_data, test_data = train_test_split(
    data,
    test_size=0.6,  # 60% of the data is held out for testing (terms to find relations for)
    random_state=42,
)

# Configure the SBUNLP Few-Shot Learner using the Qwen model.
# It performs in-context learning via N x M batch prompting.
llm_learner = SBUNLPFewShotLearner(
    # Model / decoding
    model_name="Qwen/Qwen2.5-0.5B-Instruct",  # the Qwen model to load
    try_4bit=True,          # use 4-bit quantization if bitsandbytes + CUDA are available
    max_new_tokens=140,     # cap the length of the model's response (JSON output)
    max_input_tokens=1500,  # cap the total prompt length (context window)
    temperature=0.0,        # deterministic output (best for structured JSON)
    top_p=1.0,              # top-p sampling is effectively disabled at temperature=0.0

    # Grid settings (N x M prompts)
    n_train_chunks=7,       # N: split few-shot training examples into 7 chunks
    m_test_chunks=7,        # M: split test terms into 7 chunks (7 x 7 = 49 prompts)

    # Run controls
    limit_prompts=None,     # None runs all N x M prompts; set an integer for a dry run
    output_dir="./outputs/taskC_batches",  # optional: dump per-prompt JSON results for debugging
)

# Build the pipeline, passing the Few-Shot Learner.
pipe = LearnerPipeline(
    llm=llm_learner,
    llm_id=llm_learner.model_name,
    ontologizer_data=True,  # let the learner flatten structured ontology objects via its tasks_* helpers
    device="auto",          # automatically select CUDA or CPU
)

# Run the full learning pipeline on the taxonomy-discovery task
outputs = pipe(
    train_data=train_data,
    test_data=test_data,
    task="taxonomy-discovery",
    evaluate=True,
    ontologizer_data=True,
)

# Display the evaluation results
print("Metrics:", outputs.get("metrics"))

# Display total elapsed time for training + prediction + evaluation
print("Elapsed time:", outputs["elapsed_time"])

# Print all returned outputs (including predictions)
print(outputs)