examples/llm_learner_alexbek_rag_term_typing.py (50 additions)
# Import core modules from the OntoLearner library
from ontolearner import GeoNames, train_test_split, LearnerPipeline
from ontolearner import AlexbekRAGLearner

# Load the GeoNames ontology.
ontology = GeoNames()
ontology.load()

# Extract labeled items and split into train/test sets for evaluation
train_data, test_data = train_test_split(ontology.extract(), test_size=0.2, random_state=42)

# Configure a Retrieval-Augmented Generation (RAG) term-typing classifier.
# - llm_model_id: generator used to predict types from the prompt + retrieved examples
# - retriever_model_id: encoder used to embed items and fetch top-k similar (RAG) examples
# - device: "cuda" for GPU or "cpu"
# - top_k: number of nearest examples to retrieve per query term
# - max_new_tokens: decoding budget of the LLM during prediction
# - output_dir: where intermediate artifacts / logs can be stored
rag_learner = AlexbekRAGLearner(
    llm_model_id="Qwen/Qwen2.5-0.5B-Instruct",
    retriever_model_id="sentence-transformers/all-MiniLM-L6-v2",
    device="cuda",
    top_k=3,
    max_new_tokens=256,
    output_dir="./results/",
)
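
# For intuition, the retrieval step can be sketched directly with
# sentence-transformers (an illustrative snippet, not AlexbekRAGLearner's
# internals; the terms below are hypothetical):
#
#     from sentence_transformers import SentenceTransformer, util
#     encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
#     corpus = ["Alps", "Danube", "Berlin"]                  # train terms
#     corpus_emb = encoder.encode(corpus, convert_to_tensor=True)
#     query_emb = encoder.encode("Rhine", convert_to_tensor=True)
#     hits = util.semantic_search(query_emb, corpus_emb, top_k=3)[0]
#     # -> [{'corpus_id': ..., 'score': ...}, ...] examples fed into the prompt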

# Build the pipeline and pass raw structured objects end-to-end.
# We place the RAG learner in the llm slot and set llm_id accordingly.
pipe = LearnerPipeline(
    llm=rag_learner,
    llm_id="Qwen/Qwen2.5-0.5B-Instruct",
    ontologizer_data=True,
)

# Run the full learning pipeline on the term-typing task
# - task="term-typing" (Task B)
# - evaluate=True computes precision/recall/F1 on the held-out test split
# - ontologizer_data=True must match the pipeline flag above
outputs = pipe(
    train_data=train_data,
    test_data=test_data,
    task="term-typing",
    evaluate=True,
    ontologizer_data=True,
)

# Display the evaluation results and runtime
print("Metrics:", outputs.get("metrics")) # e.g., {'precision': ..., 'recall': ..., 'f1_micro': ..., ...}
print("Elapsed time (s):", outputs.get("elapsed_time"))
examples/llm_learner_alexbek_rf_term_typing.py (54 additions)
# Import core modules from the OntoLearner library
from ontolearner import GeoNames, train_test_split, LearnerPipeline
from ontolearner import AlexbekRFLearner # A random-forest term-typing learner over text+graph features

# Load the GeoNames ontology and extract labeled term-typing data
ontology = GeoNames()
ontology.load()

data = ontology.extract()

# Split the labeled term-typing data into train and test sets
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Configure the RF-based learner (embeddings + optional graph features)
# - device: "cpu" or "cuda"
# - threshold: decision threshold for multi-label assignment
# - use_graph_features: include ontology-graph-derived features if available
rf_learner = AlexbekRFLearner(
    device="cpu",             # switch to "cuda" if you have a GPU
    batch_size=16,
    max_length=512,           # max tokenizer length for embedding-model inputs
    threshold=0.30,           # probability cutoff for assigning each type
    use_graph_features=True,  # set False for a pure RF on text embeddings only
)

# Build the pipeline and pass raw structured objects end-to-end.
pipe = LearnerPipeline(
    retriever=rf_learner,
    retriever_id="intfloat/e5-base-v2",  # or "Qwen/Qwen3-Embedding-4B" if you have sufficient GPU memory
    ontologizer_data=True,               # True if data is already {"term": ..., "types": [...], ...}
    device="cpu",
    batch_size=16,
)

# Run the full learning pipeline on the term-typing task
outputs = pipe(
    train_data=train_data,
    test_data=test_data,
    task="term-typing",
    evaluate=True,
    ontologizer_data=True,
)

# Display the evaluation summary, runtime, and ontology metadata
print("Metrics:", outputs.get("metrics"))
print("Elapsed time:", outputs["elapsed_time"])
print(ontology)
examples/llm_learner_alexbek_self_attn_taxonomy_discovery.py (41 additions)
from ontolearner import GeoNames, train_test_split, LearnerPipeline
from ontolearner import AlexbekCrossAttnLearner

# 1) Load & split
ontology = GeoNames()
ontology.load()
data = ontology.extract()
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# 2) Configure the cross-attention learner
cross_learner = AlexbekCrossAttnLearner(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",  # or a "Qwen/Qwen2.5-1.5B-..." wrapped as a SentenceTransformer
    device="cpu",
    num_heads=8,
    lr=5e-5,
    weight_decay=0.01,
    num_epochs=1,
    batch_size=256,
    neg_ratio=1.0,
    output_dir="./results/crossattn/",
    seed=42,
)
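
# The core idea can be sketched with torch.nn.MultiheadAttention (an
# illustrative toy, not AlexbekCrossAttnLearner's internals): the child-term
# embedding attends over the parent-term embedding, and the attended output
# is scored for "is-a" compatibility.
#
#     import torch
#     attn = torch.nn.MultiheadAttention(embed_dim=384, num_heads=8, batch_first=True)
#     child = torch.randn(1, 1, 384)   # (batch, seq, dim) child embedding
#     parent = torch.randn(1, 1, 384)  # parent embedding
#     attended, _ = attn(query=child, key=parent, value=parent)
#     score = torch.sigmoid((attended * child).sum(-1))  # toy edge probability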

# 3) Build pipeline
pipeline = LearnerPipeline(
    llm=cross_learner,       # place the cross-attention learner in the LLM slot
    llm_id="cross-attn",     # label used for bookkeeping
    ontologizer_data=False,  # pass raw structured ontology objects straight through
)

# 4) Train + predict + evaluate
outputs = pipeline(
    train_data=train_data,
    test_data=test_data,
    task="taxonomy-discovery",
    evaluate=True,
    ontologizer_data=False,
)

print("Metrics:", outputs.get("metrics"))
print("Elapsed time:", outputs["elapsed_time"])
print(outputs)
examples/llm_learner_alexbek_text2onto.py (74 additions)
import os
import json
import torch

# LocalAutoLLM handles model loading/generation; AlexbekFewShotLearner provides fit/predict APIs
from ontolearner.learner.text2onto.alexbek import LocalAutoLLM, AlexbekFewShotLearner

# Local folder where the dataset is stored (relative to this script)
DATA_DIR = "./dataset_llms4ol_2025/TaskA-Text2Onto/ecology"

# Input paths (already saved)
TRAIN_DOCS_PATH = os.path.join(DATA_DIR, "train", "documents.jsonl")
TRAIN_TERMS2DOCS_PATH = os.path.join(DATA_DIR, "train", "terms2docs.json")
TEST_DOCS_FULL_PATH = os.path.join(DATA_DIR, "test", "text2onto_ecology_test_documents.jsonl")

# Output paths
DOC_TERMS_OUT_PATH = os.path.join(DATA_DIR, "test", "extracted_terms_ecology.fast.jsonl")
TERMS2TYPES_OUT_PATH = os.path.join(DATA_DIR, "test", "terms2types_pred_ecology.fast.json")
TYPES2DOCS_OUT_PATH = os.path.join(DATA_DIR, "test", "types2docs_pred_ecology.fast.json")

# Device selection
DEVICE = (
    "cuda"
    if torch.cuda.is_available()
    else ("mps" if torch.backends.mps.is_available() else "cpu")
)

# Model config
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
LOAD_IN_4BIT = (DEVICE == "cuda") # 4-bit helps on GPU

# 1) Load LLM
llm = LocalAutoLLM(device=DEVICE)
llm.load(MODEL_ID, load_in_4bit=LOAD_IN_4BIT)
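
# Under the hood, 4-bit loading typically goes through bitsandbytes; a minimal
# transformers-only sketch (assuming bitsandbytes is installed, and shown for
# illustration rather than as LocalAutoLLM's actual code) would be:
#
#     from transformers import AutoModelForCausalLM, BitsAndBytesConfig
#     model = AutoModelForCausalLM.from_pretrained(
#         MODEL_ID,
#         quantization_config=BitsAndBytesConfig(load_in_4bit=True),
#         device_map="auto",
#     )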

# 2) Build few-shot exemplars from training split
learner = AlexbekFewShotLearner(model=llm, device=DEVICE)
learner.fit(
    train_docs_jsonl=TRAIN_DOCS_PATH,
    terms2doc_json=TRAIN_TERMS2DOCS_PATH,
    # use defaults for sample size and seed
)

# 3) Predict terms per test document
os.makedirs(os.path.dirname(DOC_TERMS_OUT_PATH), exist_ok=True)
num_written_doc_terms = learner.predict_terms(
    docs_test_jsonl=TEST_DOCS_FULL_PATH,
    out_jsonl=DOC_TERMS_OUT_PATH,
    # use defaults for max_new_tokens and few_shot_k
)
print(f"[terms] wrote {num_written_doc_terms} lines → {DOC_TERMS_OUT_PATH}")

# 4) Predict types for extracted terms, using the JSONL we just wrote
typing_summary = learner.predict_types_from_terms(
    doc_terms_jsonl=DOC_TERMS_OUT_PATH,  # read the term predictions we just wrote
    doc_terms_list=None,                 # not needed when doc_terms_jsonl is provided
    model_id=MODEL_ID,                   # reuse the same small model
    out_terms2types=TERMS2TYPES_OUT_PATH,
    out_types2docs=TYPES2DOCS_OUT_PATH,
    # use defaults for everything else
)

print(f"[types] {typing_summary['unique_terms']} unique terms | {typing_summary['types_count']} types")
print(f"[saved] {TERMS2TYPES_OUT_PATH}")
print(f"[saved] {TYPES2DOCS_OUT_PATH}")

# 5) Small preview of term→types
try:
    with open(TERMS2TYPES_OUT_PATH, "r", encoding="utf-8") as fin:
        preview = json.load(fin)[:3]
    print("[preview] first 3:")
    print(json.dumps(preview, ensure_ascii=False, indent=2))
except Exception as e:
    print(f"[preview] skipped: {e}")
examples/llm_learner_rwthdbis_taxonomy_discovery.py (57 additions)
# Import core modules from the OntoLearner library
from ontolearner import LearnerPipeline, train_test_split
from ontolearner import ChordOntology, RWTHDBISTaxonomyLearner

# Load the Chord ontology, which exposes hierarchical (parent, child) relations for taxonomy discovery
ontology = ChordOntology()
ontology.load() # Read entities, type system, and taxonomic edges into memory

# Extract typed taxonomic edges and split into train/test while preserving the structured shape
train_data, test_data = train_test_split(
    ontology.extract(),
    test_size=0.2,
    random_state=42,
)

# Initialize a supervised taxonomy classifier (encoder-based fine-tuning)
# Negative sampling controls the number of non-edge examples; bidirectional templates create both (p→c) and (c→p) views
# Context features are optional and can be enabled with with_context=True and a JSON path of type descriptions
learner = RWTHDBISTaxonomyLearner(
    model_name="microsoft/deberta-v3-small",
    output_dir="./results/",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    max_length=256,
    seed=42,
    negative_ratio=5,
    bidirectional_templates=True,
    context_json_path=None,
    ontology_name=ontology.ontology_full_name,
)
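
# For intuition, negative sampling and bidirectional templates can be sketched
# as follows (an illustrative snippet, not the learner's internals; the terms
# are hypothetical): each true (parent, child) edge is paired with
# `negative_ratio` corrupted edges, and every pair is verbalized in both
# directions.
#
#     import random
#     terms = ["chord", "triad", "seventh chord"]
#     positives = [("chord", "triad")]  # true taxonomic edges
#     negatives = [(p, random.choice(terms)) for p, _ in positives for _ in range(5)]
#     def verbalize(parent, child):
#         return [f"{child} is a kind of {parent}.", f"{parent} subsumes {child}."]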

# Build the pipeline
pipeline = LearnerPipeline(
    llm=learner,
    llm_id=learner.model_name,
    ontologizer_data=False,
)

# Run the full learning pipeline on the taxonomy-discovery task
outputs = pipeline(
    train_data=train_data,
    test_data=test_data,
    task="taxonomy-discovery",
    evaluate=True,
    ontologizer_data=False,
)

# Display the evaluation results
print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...}

# Display total elapsed time for training + prediction + evaluation
print("Elapsed time:", outputs['elapsed_time'])

# Print all returned outputs (including predictions)
print(outputs)
examples/llm_learner_rwthdbis_term_typing.py (50 additions)
# Import core modules from the OntoLearner library
from ontolearner import LearnerPipeline, train_test_split, AgrO
from ontolearner import RWTHDBISTermTypingLearner

# Load the AgrO ontology.
# AgrO provides term-typing supervision where each term can be annotated with one or more types.
ontology = AgrO()
ontology.load()
data = ontology.extract()

# Split the labeled term-typing data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Configure a supervised encoder-based classifier for term typing.
# This fine-tunes DeBERTa v3 on (term → type) signals; increase epochs for stronger results.
learner = RWTHDBISTermTypingLearner(
    model_name="microsoft/deberta-v3-small",
    output_dir="./results/deberta-v3",
    num_train_epochs=30,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    max_length=64,
    seed=42,
)
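
# One common recipe behind such learners is to score (term, candidate type)
# pairs with a cross-encoder. This sketch is illustrative only (not
# RWTHDBISTermTypingLearner's internal code; the term/type pair is
# hypothetical):
#
#     import torch
#     from transformers import AutoTokenizer, AutoModelForSequenceClassification
#     tok = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
#     clf = AutoModelForSequenceClassification.from_pretrained(
#         "microsoft/deberta-v3-small", num_labels=2
#     )
#     enc = tok("paddy field", "agricultural site", return_tensors="pt",
#               truncation=True, max_length=64)
#     with torch.no_grad():
#         p_match = clf(**enc).logits.softmax(-1)[0, 1].item()  # P(term has type)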

# Build the pipeline and pass raw structured objects end-to-end.
pipeline = LearnerPipeline(
    llm=learner,
    llm_id=learner.model_name,
    ontologizer_data=False,
)

# Run the full learning pipeline on the term-typing task
outputs = pipeline(
    train_data=train_data,
    test_data=test_data,
    task="term-typing",
    evaluate=True,
    ontologizer_data=False,
)

# Display the evaluation results
print("Metrics:", outputs['metrics']) # Shows {'precision': ..., 'recall': ..., 'f1_score': ...}

# Display total elapsed time for training + prediction + evaluation
print("Elapsed time:", outputs['elapsed_time'])

# Print all returned outputs (including predictions)
print(outputs)
examples/llm_learner_sbunlp_fs_taxonomy_discovery.py (66 additions)
# Import core modules from the OntoLearner library
from ontolearner import GeoNames, train_test_split, LearnerPipeline
# Import the specific Few-Shot Learner implementation
from ontolearner import SBUNLPFewShotLearner

# Load the GeoNames ontology for taxonomy discovery.
# GeoNames provides geographic parent-child relationships (an is-a hierarchy).
ontology = GeoNames()
ontology.load()
data = ontology.extract() # Extract the list of taxonomic relationships from the ontology object

# Split the taxonomic relationships into train and test sets
train_data, test_data = train_test_split(
    data,
    test_size=0.6,  # 60% of the data is held out for testing (terms to find relations for)
    random_state=42,
)

# Configure the SBUNLP Few-Shot Learner using the Qwen model.
# It performs in-context learning via N x M batch prompting.
llm_learner = SBUNLPFewShotLearner(
    # Model / decoding
    model_name="Qwen/Qwen2.5-0.5B-Instruct",  # the Qwen model to load
    try_4bit=True,          # use 4-bit quantization if bitsandbytes + CUDA are available
    max_new_tokens=140,     # cap the length of the model's response (JSON output)
    max_input_tokens=1500,  # cap the total prompt length (context window)
    temperature=0.0,        # deterministic output (best for structured JSON)
    top_p=1.0,              # top-p sampling is effectively disabled at temperature=0.0

    # Grid settings (N x M prompts)
    n_train_chunks=7,       # N: split few-shot training examples into 7 chunks
    m_test_chunks=7,        # M: split test terms into 7 chunks (7 x 7 = 49 prompts)

    # Run controls
    limit_prompts=None,     # None runs all N x M prompts; set an integer for a dry run
    output_dir="./outputs/taskC_batches",  # optional: dump per-prompt JSON results for debugging
)

# Build the pipeline, passing the Few-Shot Learner.
pipe = LearnerPipeline(
    llm=llm_learner,
    llm_id=llm_learner.model_name,
    ontologizer_data=True,  # let the learner flatten structured ontology objects via its tasks_* helpers
    device="auto",          # automatically select CUDA or CPU
)

# Run the full learning pipeline on the taxonomy-discovery task
outputs = pipe(
    train_data=train_data,
    test_data=test_data,
    task="taxonomy-discovery",
    evaluate=True,
    ontologizer_data=True,
)

# Display the evaluation results
print("Metrics:", outputs.get("metrics"))

# Display total elapsed time for training + prediction + evaluation
print("Elapsed time:", outputs["elapsed_time"])

# Print all returned outputs (including predictions)
print(outputs)