sciknoworg · Krishna-Rani-t · Oct 22, 2025 · Oct 29, 2025 · Oct 29, 2025 · Nov 3, 2025
diff --git a/examples/llm_learner_rwthdbis_taxonomy_discovery.py b/examples/llm_learner_rwthdbis_taxonomy_discovery.py
@@ -0,0 +1,57 @@
+# Import core modules from the OntoLearner library
+from ontolearner import LearnerPipeline, train_test_split
+from ontolearner import ChordOntology, RWTHDBISTaxonomyLearner
+
+# Load the Chord ontology, which exposes hierarchical (parent, child) relations for taxonomy discovery
+ontology = ChordOntology()
+ontology.load()  # Read entities, type system, and taxonomic edges into memory
+
+# Extract typed taxonomic edges and split into train/test (test_size=0.2, fixed seed) while preserving the structured shape
+train_data, test_data = train_test_split(
+    ontology.extract(),
+    test_size=0.2,
+    random_state=42
+)
+
+# Initialize a supervised taxonomy classifier (encoder-based fine-tuning)
+# negative_ratio controls the number of non-edge examples; bidirectional templates create both (p→c) and (c→p) views
+# Context features are optional and can be enabled with with_context=True and a JSON path of type descriptions (context_json_path=None here passes no such file)
+learner = RWTHDBISTaxonomyLearner(
+    model_name="microsoft/deberta-v3-small",
+    output_dir="./results/",
+    num_train_epochs=1,
+    per_device_train_batch_size=8,
+    gradient_accumulation_steps=4,
+    learning_rate=2e-5,
+    max_length=256,
+    seed=42,
+    negative_ratio=5,
+    bidirectional_templates=True,
+    context_json_path=None,
+    ontology_name=ontology.ontology_full_name,
+)
+
+# Build the pipeline around the learner; the learner's model name is reused as the pipeline's llm_id
+pipeline = LearnerPipeline(
+    llm=learner,
+    llm_id=learner.model_name,
+    ontologizer_data=False,
+)
+
+# Run the full learning pipeline on the taxonomy-discovery task
+outputs = pipeline(
+    train_data=train_data,
+    test_data=test_data,
+    task="taxonomy-discovery",
+    evaluate=True,
+    ontologizer_data=False,
+)
+
+# Display the evaluation results
+print("Metrics:", outputs['metrics'])          # Shows {'precision': ..., 'recall': ..., 'f1_score': ...}
+
+# Display total elapsed time for training + prediction + evaluation
+print("Elapsed time:", outputs['elapsed_time'])
+
+# Print all returned outputs (including predictions)
+print(outputs)
diff --git a/examples/llm_learner_rwthdbis_term_typing.py b/examples/llm_learner_rwthdbis_term_typing.py
@@ -0,0 +1,50 @@
+# Import core modules from the OntoLearner library
+from ontolearner import LearnerPipeline, train_test_split, AgrO
+from ontolearner import RWTHDBISTermTypingLearner
+
+# Load the AgrO ontology.
+# AgrO provides term-typing supervision where each term can be annotated with one or more types.
+ontology = AgrO()
+ontology.load()
+data = ontology.extract()
+
+# Split the labeled term-typing data into train and test sets (test_size=0.2, fixed seed for reproducibility)
+train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
+
+# Configure a supervised encoder-based classifier for term typing.
+# This fine-tunes DeBERTa v3 on (term → type) signals; results are written under output_dir.
+learner = RWTHDBISTermTypingLearner(
+    model_name="microsoft/deberta-v3-small",
+    output_dir="./results/deberta-v3",
+    num_train_epochs=30,
+    per_device_train_batch_size=16,
+    gradient_accumulation_steps=2,
+    learning_rate=2e-5,
+    max_length=64,
+    seed=42,
+)
+
+# Build the pipeline and pass raw structured objects end-to-end.
+pipeline = LearnerPipeline(
+    llm=learner,
+    llm_id=learner.model_name,
+    ontologizer_data=False,
+)
+
+# Run the full learning pipeline on the term-typing task
+outputs = pipeline(
+    train_data=train_data,
+    test_data=test_data,
+    task="term-typing",
+    evaluate=True,
+    ontologizer_data=False,
+)
+
+# Display the evaluation results
+print("Metrics:", outputs['metrics'])          # Shows {'precision': ..., 'recall': ..., 'f1_score': ...}
+
+# Display total elapsed time for training + prediction + evaluation
+print("Elapsed time:", outputs['elapsed_time'])
+
+# Print all returned outputs (including predictions)
+print(outputs)
diff --git a/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py b/examples/llm_learner_sbunlp_fs_taxonomy_discovery.py
@@ -0,0 +1,66 @@
+# Import core modules from the OntoLearner library
+from ontolearner import GeoNames, train_test_split, LearnerPipeline
+# Import the specific Few-Shot Learner implementation
+from ontolearner import SBUNLPFewShotLearner
+
+# Step 1: load the ontology and split it.
+# The GeoNames ontology is used here for taxonomy discovery.
+# GeoNames provides geographic parent-child relationships (is-a hierarchy).
+ontology = GeoNames()
+ontology.load()
+data = ontology.extract() # Extract the list of taxonomic relationships from the ontology object
+
+# Split the taxonomic relationships into train and test sets
+train_data, test_data = train_test_split(
+    data,
+    test_size=0.6, # 60% of data used for testing (terms to find relations for)
+    random_state=42,
+)
+
+# Step 2: configure the SBUNLP Few-Shot Learner (Qwen model) with user-defined inference args.
+# No device is set on the learner here; the device is passed to LearnerPipeline below.
+# This performs in-context learning via N x M batch prompting.
+llm_learner = SBUNLPFewShotLearner(
+    # Model / decoding
+    model_name="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load
+    try_4bit=True,              # uses 4-bit if bitsandbytes + CUDA available for memory efficiency
+    max_new_tokens=140,         # limit the length of the model's response (for JSON output)
+    max_input_tokens=1500,      # limit the total prompt length (context window)
+    temperature=0.0,            # set to 0.0 for deterministic output (best for structured JSON)
+    top_p=1.0,                  # top-p sampling disabled with temperature=0.0
+
+    # Grid settings (N x M prompts)
+    n_train_chunks=7,           # N: split training examples (few-shot context) into 7 chunks
+    m_test_chunks=7,            # M: split test terms (vocabulary) into 7 chunks (total 49 prompts)
+
+    # Run controls
+    limit_prompts=None,         # None runs all N x M prompts; set to an integer for a dry-run
+    output_dir="./outputs/taskC_batches",  # Optional: dump per-prompt JSON results for debugging
+)
+
+# Step 3: build the pipeline and run it.
+# The pipeline wraps the Few-Shot Learner and reuses its model name as llm_id.
+pipe = LearnerPipeline(
+    llm=llm_learner,
+    llm_id=llm_learner.model_name,
+    ontologizer_data=True,      # Let the learner flatten structured ontology objects via its tasks_* helpers
+    device="auto",              # automatically select CUDA or CPU
+)
+
+# Run the full learning pipeline on the taxonomy-discovery task
+outputs = pipe(
+    train_data=train_data,
+    test_data=test_data,
+    task="taxonomy-discovery",
+    evaluate=True,
+    ontologizer_data=True,
+)
+
+# Display the evaluation results (.get yields None instead of raising if the key is absent)
+print("Metrics:", outputs.get("metrics"))
+
+# Display total elapsed time for training + prediction + evaluation
+print("Elapsed time:", outputs["elapsed_time"])
+
+# Print all returned outputs (including predictions)
+print(outputs)
diff --git a/examples/llm_learner_sbunlp_text2onto.py b/examples/llm_learner_sbunlp_text2onto.py
@@ -0,0 +1,81 @@
+import os
+import torch
+# Import the learner and the local LLM helper that is passed to it
+from ontolearner import SBUNLPText2OntoLearner
+from ontolearner.learner.text2onto.sbunlp import LocalAutoLLM
+
+# Local folder where the dataset is stored
+# This path is relative to the directory where the script is executed
+# (e.g., E:\OntoLearner\examples)
+LOCAL_DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology"
+TRAIN_DIR = os.path.join(LOCAL_DATA_DIR, "train")
+TEST_DIR = os.path.join(LOCAL_DATA_DIR, "test")
+
+# Ensure the train and test subdirectories exist (exist_ok=True makes this a no-op when they already do)
+os.makedirs(TRAIN_DIR, exist_ok=True)
+os.makedirs(TEST_DIR, exist_ok=True)
+
+# Input files for the Fit and Predict phases (expected to be saved already); built from LOCAL_DATA_DIR so all paths move with it
+DOCS_ALL_PATH = os.path.join(TRAIN_DIR, "documents.jsonl")
+TERMS2DOC_PATH = os.path.join(TRAIN_DIR, "terms2docs.json")
+DOCS_TEST_PATH = os.path.join(TEST_DIR, "text2onto_ecology_test_documents.jsonl")
+
+# Output files for predictions (saved directly under LOCAL_DATA_DIR/test)
+# These files will be created by the predict_terms/types methods.
+TERMS_PRED_OUT = os.path.join(TEST_DIR, "extracted_terms_ecology.jsonl")
+TYPES_PRED_OUT = os.path.join(TEST_DIR, "extracted_types_ecology.jsonl")
+
+# Initialize and load the learner
+MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+# Determine the device for inference (GPU or CPU)
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Instantiate the underlying LLM helper (LocalAutoLLM handles model loading and generation)
+llm_model_helper = LocalAutoLLM(device=DEVICE)
+
+# Instantiate the main learner class, passing the LLM helper to its constructor
+learner = SBUNLPText2OntoLearner(model=llm_model_helper, device=DEVICE)
+
+# Load the model (This calls llm_model_helper.load); 4-bit loading is requested only when CUDA is available
+LOAD_IN_4BIT = torch.cuda.is_available()
+learner.model.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT)
+
+# Fit phase: build the few-shot exemplars (in-context learning prompts) from the local training files
+learner.fit(
+    train_docs_jsonl=DOCS_ALL_PATH,
+    terms2doc_json=TERMS2DOC_PATH,
+    sample_size=28,
+    seed=123 # Seed for stratified random sampling stability
+)
+
+MAX_NEW_TOKENS = 100  # generation budget shared by both prediction calls below
+
+# Term Extraction subtask
+terms_written = learner.predict_terms(
+    docs_test_jsonl=DOCS_TEST_PATH,
+    out_jsonl=TERMS_PRED_OUT,
+    max_new_tokens=MAX_NEW_TOKENS
+)
+print(f"✅ Term Extraction Complete. Wrote {terms_written} prediction lines.")
+
+# Type Extraction subtask
+types_written = learner.predict_types(
+    docs_test_jsonl=DOCS_TEST_PATH,
+    out_jsonl=TYPES_PRED_OUT,
+    max_new_tokens=MAX_NEW_TOKENS
+)
+print(f"✅ Type Extraction Complete. Wrote {types_written} prediction lines.")
+
+try:
+    # Evaluate Term Extraction using the custom F1 function and gold data
+    # NOTE(review): the gold file is train/terms2docs.json while predictions come from the test documents — confirm this pairing
+    f1_term = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TERMS_PRED_OUT, key="term")
+    print(f"Final Term Extraction F1: {f1_term:.4f}")
+
+    # Evaluate Type Extraction
+    f1_type = learner.evaluate_extraction_f1(TERMS2DOC_PATH, TYPES_PRED_OUT, key="type")
+    print(f"Final Type Extraction F1: {f1_type:.4f}")
+
+except Exception as e:
+    # Best-effort boundary: covers errors like missing sklearn (ImportError) or missing prediction files (FileNotFoundError)
+    print(f"❌ Evaluation Error: {e}. Ensure sklearn is installed and prediction files were created.")
diff --git a/examples/llm_learner_sbunlp_zs_term_typing.py b/examples/llm_learner_sbunlp_zs_term_typing.py
@@ -0,0 +1,55 @@
+# Import core modules from the OntoLearner library
+from ontolearner import AgrO, train_test_split, LearnerPipeline
+# Import the specific Zero-Shot Learner implementation for Term Typing
+from ontolearner import SBUNLPZSLearner
+
+# Step 1: load the ontology and split it.
+# The AgrO ontology supplies the type inventory and the test data.
+ontology = AgrO()
+ontology.load()
+data = ontology.extract() # Extract the full set of relationships/terms
+
+# Split the data into train (to learn type inventory) and test (terms to predict)
+train_data, test_data = train_test_split(
+    data,
+    test_size=0.6, # 60% of data used for testing
+    random_state=42,
+)
+
+# Step 2: configure the Qwen Zero-Shot learner (inference-only)
+# This learner's 'fit' phase learns the vocabulary of allowed type labels.
+llm_learner = SBUNLPZSLearner(
+    # Model / decoding
+    model_id="Qwen/Qwen2.5-0.5B-Instruct", # The Qwen model to load
+    # device is not passed to the learner here; NOTE(review): presumably auto-detected — confirm
+    max_new_tokens=64,         # Sufficient length for JSON list of types
+    temperature=0.0,           # Ensures deterministic (greedy) output
+    # token= None,             # Assuming public model access
+)
+
+# Step 3: build the pipeline and run it.
+# The pipeline wraps the Zero-Shot Learner and reuses its model_id as llm_id.
+pipe = LearnerPipeline(
+    llm=llm_learner,
+    llm_id=llm_learner.model_id,
+    ontologizer_data=False,
+    device="cpu",             # runs on CPU in this example; TODO confirm accepted values (e.g. "cuda") before switching to a GPU
+)
+
+# Run the full learning pipeline on the Term-Typing task
+outputs = pipe(
+    train_data=train_data,
+    test_data=test_data,
+    task="term-typing",
+    evaluate=True,
+    ontologizer_data=False,
+)
+
+# Display the evaluation results
+print("Metrics:", outputs.get("metrics"))
+
+# Display total elapsed time for learning (type inventory) + prediction + evaluation
+print("Elapsed time:", outputs.get("elapsed_time"))
+
+# Print all returned outputs (including predictions)
+print(outputs)
diff --git a/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py b/examples/llm_learner_skhnlp_sft_taxonomoy_discovery.py
@@ -0,0 +1,64 @@
+# Import core modules from the OntoLearner library
+from ontolearner import GeoNames, train_test_split, LearnerPipeline
+from ontolearner import SKHNLPSequentialFTLearner
+
+# Step 1: load the ontology and split it.
+# The GeoNames ontology is used here for taxonomy discovery.
+# GeoNames provides geographic parent-child relationships (is-a hierarchy).
+ontology = GeoNames()
+ontology.load()
+data = ontology.extract()
+
+# Split the taxonomic relationships into train and test sets (test_size=0.2, fixed seed)
+train_data, test_data = train_test_split(
+    data,
+    test_size=0.2,
+    random_state=42
+)
+
+# Step 2: configure the learner with user-defined training args; the device is set on the learner itself.
+# This is the supervised BERT SFT Learner for taxonomy discovery.
+# This fine-tunes BERT-Large using Sequential Prompts on (Parent, Child) pairs.
+bert_learner = SKHNLPSequentialFTLearner(
+    model_name="bert-large-uncased",
+    n_prompts=2,
+    random_state=1403,
+    device="cpu", # Note: CPU training for BERT-Large is very slow.
+    output_dir="./results/",
+    num_train_epochs=1,
+    per_device_train_batch_size=8,
+    per_device_eval_batch_size=8,
+    warmup_steps=500,
+    weight_decay=0.01,
+    logging_dir="./logs/",
+    logging_steps=50,
+    eval_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+)
+
+# Step 3: build the pipeline and run it.
+# The pipeline wraps the BERT Learner; no device argument is passed to the pipeline here.
+pipeline = LearnerPipeline(
+    llm=bert_learner,
+    llm_id="bert-large-uncased",  # same identifier as model_name above; keep the two in sync
+    ontologizer_data=False,
+)
+
+# Run the full learning pipeline on the taxonomy-discovery task
+outputs = pipeline(
+    train_data=train_data,
+    test_data=test_data,
+    task="taxonomy-discovery",
+    evaluate=True,
+    ontologizer_data=False,
+)
+
+# Display the evaluation results (.get yields None instead of raising if the key is absent)
+print("Metrics:", outputs.get("metrics"))
+
+# Display total elapsed time for training + prediction + evaluation
+print("Elapsed time:", outputs["elapsed_time"])
+
+# Print all returned outputs (including predictions)
+print(outputs)