Skip to content

Commit

Permalink
add mirrors to datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
mam10eks committed Dec 1, 2024
1 parent eb0f3b0 commit e516e69
Show file tree
Hide file tree
Showing 14 changed files with 404 additions and 57 deletions.
2 changes: 2 additions & 0 deletions application/src/tira_app/endpoints/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
from ._runs import endpoints as run_endpoints
from ._systems import endpoints as system_endpoints
from ._tasks import endpoints as task_endpoints
from ._tirex import endpoints as tirex_endpoints
from ._user import endpoints as user_endpoints

endpoints = [
path("anonymous/", include(anonymous_endpoints)),
path("datasets/", include(dataset_endpoints)),
path("systems/", include(system_endpoints)),
path("tirex/", include(tirex_endpoints)),
path("evaluations/", include(evaluation_endpoints)),
path("organizers/", include(organizer_endpoints)),
path("runs/", include(run_endpoints)),
Expand Down
89 changes: 88 additions & 1 deletion application/src/tira_app/endpoints/v1/_datasets.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,44 @@
import json
from hashlib import md5
from pathlib import Path

import requests
from django.conf import settings
from django.urls import path
from rest_framework import pagination
from rest_framework.permissions import AllowAny
from rest_framework.serializers import CharField, ModelSerializer
from rest_framework.serializers import CharField, ModelSerializer, SerializerMethodField
from rest_framework_json_api.views import ModelViewSet

from ... import model as modeldb


class DatasetSerializer(ModelSerializer):
    """Serialize a dataset together with its mirrors and the name of its default task."""

    id = CharField(source="dataset_id")
    mirrors = SerializerMethodField()
    default_task_name = SerializerMethodField()

    class Meta:
        model = modeldb.Dataset
        fields = [
            "id",
            "dataset_id",
            "default_task",
            "default_task_name",
            "display_name",
            "is_confidential",
            "is_deprecated",
            "ir_datasets_id",
            "chatnoir_id",
            "mirrors",
        ]

    def get_mirrors(self, obj):
        """Return the mirrors registered for the dataset, as produced by ``mirrors_for_dataset``."""
        dataset_id = obj.dataset_id
        return mirrors_for_dataset(dataset_id)

    def get_default_task_name(self, obj):
        """Return the name of the dataset's default task, or ``None`` when no default task is set."""
        task = obj.default_task
        if not task:
            return None
        return task.task_name


class _DatasetView(ModelViewSet):
queryset = modeldb.Dataset.objects.all()
Expand All @@ -32,6 +48,77 @@ class _DatasetView(ModelViewSet):
permission_classes = [AllowAny]


def load_mirrored_resource(md5_sum):
    """Load the mirrored resource with the given MD5 checksum.

    Args:
        md5_sum: The MD5 hex digest that identifies the mirrored resource.

    Returns:
        A dict with the keys ``md5_sum``, ``md5_first_kilobyte``, ``size`` and ``mirrors``
        (the decoded JSON stored in the ``mirrors`` column, or an empty dict if the column
        is empty), or ``None`` if no resource with this checksum exists.
    """
    obj = modeldb.MirroredResource.objects.filter(md5_sum=md5_sum).first()

    # .first() yields None for an unknown checksum; callers test the result for falsiness.
    if obj is None:
        return None

    return {
        "md5_sum": obj.md5_sum,
        "md5_first_kilobyte": obj.md5_first_kilobyte,
        "size": obj.size,
        # The column is nullable, and json.loads(None) would raise a TypeError.
        "mirrors": json.loads(obj.mirrors) if obj.mirrors else {},
    }


def mirrors_for_dataset(dataset_id: str):
    """Collect the mirrors of all resources that are linked to a dataset.

    Args:
        dataset_id: The TIRA dataset id whose mirrors should be collected.

    Returns:
        A single dict that merges the ``mirrors`` entries of every linked resource; when
        two resources use the same key, the one processed later wins.
    """
    merged = {}
    links = modeldb.DatasetHasMirroredResource.objects.filter(dataset__dataset_id=dataset_id)

    for link in links:
        resource = load_mirrored_resource(link.mirrored_resource.md5_sum)
        if resource and resource["mirrors"]:
            merged.update(resource["mirrors"].items())

    return merged


def add_mirrored_resource(dataset_id: str, url: str, name: str, timeout: float = 60.0):
    """Download ``url`` and register it under ``name`` as a mirror of a dataset.

    The downloaded content is identified by its MD5 checksum. If no resource with that
    checksum is known yet, the content is stored below
    ``<TIRA_ROOT>/data/mirrored-resources/<md5>`` and a new ``MirroredResource`` row is
    created; otherwise only the mirror list of the existing row is extended. Finally the
    resource is linked to the dataset, unless that link already exists.

    Args:
        dataset_id: The TIRA dataset id the mirror belongs to.
        url: The URL the resource can be downloaded from.
        name: The key under which ``url`` is stored in the mirror list.
        timeout: Timeout in seconds that is passed to ``requests.get``.

    Raises:
        ValueError: If the dataset does not exist or the download does not answer with
            status code 200.
    """
    # Fail before downloading anything if there is no dataset to attach the mirror to.
    dataset = modeldb.Dataset.objects.filter(dataset_id=dataset_id).first()
    if dataset is None:
        raise ValueError(f'No dataset with id "{dataset_id}" exists.')

    # Nothing to do if one of the resources already linked to the dataset lists this URL.
    for link in modeldb.DatasetHasMirroredResource.objects.filter(dataset__dataset_id=dataset_id):
        linked = link.mirrored_resource
        known_mirrors = json.loads(linked.mirrors) if linked.mirrors else {}
        if url in known_mirrors.values():
            print(f"Mirrored URL {url} already exists: {load_mirrored_resource(linked.md5_sum)}")
            return

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise ValueError(f"Failed to load {url}. Response code {response.status_code}.")

    content = response.content
    md5_sum = md5(content).hexdigest()
    md5_first_kilobyte = md5(content[:1024]).hexdigest()

    existing = modeldb.MirroredResource.objects.filter(md5_sum=md5_sum).first()
    mirrors = json.loads(existing.mirrors) if existing is not None and existing.mirrors else {}
    mirrors[name] = url

    if existing is None:
        target_dir = Path(settings.TIRA_ROOT) / "data" / "mirrored-resources"
        target_dir.mkdir(exist_ok=True, parents=True)
        (target_dir / md5_sum).write_bytes(content)

        mirror = modeldb.MirroredResource.objects.create(
            md5_sum=md5_sum,
            md5_first_kilobyte=md5_first_kilobyte,
            size=len(content),
            mirrors=json.dumps(mirrors),
        )
    else:
        # Restrict the update to this resource: an unfiltered QuerySet.update() would
        # rewrite the checksum and the mirrors of every row in the table.
        modeldb.MirroredResource.objects.filter(md5_sum=md5_sum).update(mirrors=json.dumps(mirrors))
        mirror = existing

    already_linked = modeldb.DatasetHasMirroredResource.objects.filter(
        dataset=dataset, mirrored_resource=mirror
    ).exists()
    if not already_linked:
        modeldb.DatasetHasMirroredResource.objects.create(dataset=dataset, mirrored_resource=mirror)

    print(load_mirrored_resource(md5_sum))


endpoints = [
path("", _DatasetView.as_view({"get": "list"})),
path("all", _DatasetView.as_view({"get": "list"})),
Expand Down
95 changes: 95 additions & 0 deletions application/src/tira_app/endpoints/v1/_tirex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from django.urls import path
from rest_framework.decorators import api_view
from rest_framework.request import Request
from rest_framework.response import Response

from ... import model as modeldb


@api_view(["GET"])
def topics(request: Request, dataset_id: str) -> Response:
    """List all topics (queries) of a TIRA dataset.

    Args:
        request (Request): The request that triggered the REST API call.
        dataset_id (str): The TIRA dataset id for which the topics should be returned.

    Returns:
        Response: One entry per query with the keys ``qid``, ``dataset_id`` and ``default_text``.

    Raises:
        ValueError: If the dataset has no ir_datasets id configured.
    """
    import ir_datasets

    tira_dataset = modeldb.Dataset.objects.get(dataset_id=dataset_id)
    if not tira_dataset.ir_datasets_id:
        raise ValueError(f'No ir dataset id specified for TIRA dataset "{dataset_id}".')

    queries = ir_datasets.load(tira_dataset.ir_datasets_id).queries_iter()
    payload = [
        {"qid": query.query_id, "dataset_id": dataset_id, "default_text": query.default_text()}
        for query in queries
    ]

    return Response(payload)


@api_view(["GET"])
def topic(request: Request, dataset_id: str, qid: str) -> Response:
    """Get a single topic (query) of a TIRA dataset.

    Args:
        request (Request): The request that triggered the REST API call.
        dataset_id (str): The TIRA dataset id for which the topic should be returned.
        qid (str): The query id of the topic that should be returned.

    Returns:
        Response: The topic with the keys ``qid``, ``dataset_id``, ``default_text`` and
        ``docs``, plus ``description`` and ``narrative`` when the query provides them.

    Raises:
        ValueError: If the dataset has no ir_datasets id configured or no query has the id ``qid``.
    """
    import ir_datasets

    dataset = modeldb.Dataset.objects.get(dataset_id=dataset_id)

    if not dataset.ir_datasets_id:
        raise ValueError(f'No ir dataset id specified for TIRA dataset "{dataset_id}".')

    for q in ir_datasets.load(dataset.ir_datasets_id).queries_iter():
        # Query ids are compared as strings because qid arrives as a URL path segment.
        if str(q.query_id) != qid:
            continue

        ret = {"qid": qid, "dataset_id": dataset_id, "default_text": q.default_text(), "docs": {}}

        # Not every query type has these fields. Check each one on its own so that a
        # missing description does not also drop an existing narrative, and so that
        # unrelated errors are no longer silently swallowed.
        for optional_field in ("description", "narrative"):
            if hasattr(q, optional_field):
                ret[optional_field] = getattr(q, optional_field)

        return Response(ret)

    raise ValueError(f"No topic found with id {qid}.")


@api_view(["GET"])
def run_by_uuid(request: Request, run_uuid: str) -> Response:
    """Get meta data and the ranking for the specified run.

    The response is currently a hard-coded placeholder: the dataset id is fixed and the
    ranking holds the same ten synthetic documents for each of the topics "0" to "49".

    Args:
        request (Request): The request that triggered the REST API call.
        run_uuid (str): The UUID for the run.

    Returns:
        Response: A dict with the keys ``tira_run``, ``dataset``, ``team``, ``run`` and ``ranking``.
    """
    run_id = f"uuid-{run_uuid}"
    ranking = {
        str(topic_number): [
            {"rank": rank, "score": 10 - rank, "doc_id": f"doc-{rank}"} for rank in range(1, 11)
        ]
        for topic_number in range(50)
    }

    payload = {
        "tira_run": run_id,
        "dataset": "clueweb09-en-trec-web-2009-20230107-training",
        "team": "",
        "run": run_id,
        "ranking": ranking,
    }
    return Response(payload)


# URL routes of the TIREx endpoints defined in this module.
endpoints = [
    # All topics of a dataset.
    path("topics/<str:dataset_id>", topics),
    # A single topic of a dataset, selected by its query id.
    path("topic/<str:dataset_id>/<str:qid>", topic),
    # Meta data and ranking of a run, selected by its UUID.
    path("runs-by-uuid/<str:run_uuid>", run_by_uuid),
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from django.core.management.base import BaseCommand

from tira_app.endpoints.v1._datasets import add_mirrored_resource


class Command(BaseCommand):
    """Management command that registers a mirror URL for a dataset.

    All of ``--dataset_id``, ``--url`` and ``--name`` must be passed with a non-empty value.
    """

    def add_arguments(self, parser):
        """Declare the three string options of the command; each defaults to ``None``."""
        for flag in ("--dataset_id", "--url", "--name"):
            parser.add_argument(flag, default=None, type=str)

    def handle(self, *args, **options):
        """Validate the options and delegate to ``add_mirrored_resource``.

        Raises:
            ValueError: If one of the options is missing or empty.
        """
        for required in ("dataset_id", "url", "name"):
            if not options.get(required):
                raise ValueError(f"Please pass --{required}")

        add_mirrored_resource(options["dataset_id"], options["url"], options["name"])
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Generated by Django 5.0.9 on 2024-12-01 10:33

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    """Create the MirroredResource table and the table that links it to datasets."""

    # Has to be applied after migration 0003 of the "tira" app.
    dependencies = [
        ("tira", "0003_dataset_chatnoir_id_dataset_ir_datasets_id"),
    ]

    operations = [
        # A resource is keyed by its MD5 checksum; "mirrors" is a nullable text column.
        migrations.CreateModel(
            name="MirroredResource",
            fields=[
                ("md5_sum", models.CharField(max_length=150, primary_key=True, serialize=False)),
                ("md5_first_kilobyte", models.CharField(max_length=150)),
                ("size", models.BigIntegerField()),
                ("mirrors", models.CharField(default=None, max_length=500, null=True)),
            ],
        ),
        # Join table between datasets and mirrored resources; rows are removed together
        # with either side (CASCADE).
        migrations.CreateModel(
            name="DatasetHasMirroredResource",
            fields=[
                ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                ("dataset", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="tira.dataset")),
                (
                    "mirrored_resource",
                    models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to="tira.mirroredresource"),
                ),
            ],
        ),
    ]
12 changes: 12 additions & 0 deletions application/src/tira_app/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,18 @@ class Meta:
unique_together = (("task_id", "dataset_id"),)


class MirroredResource(models.Model):
    """A resource identified by its MD5 checksum, together with the mirrors it is available from."""

    # MD5 checksum of the resource; serves as the primary key.
    md5_sum = models.CharField(max_length=150, primary_key=True)
    # MD5 checksum of the first 1024 bytes of the resource (see add_mirrored_resource).
    md5_first_kilobyte = models.CharField(max_length=150)
    # Size of the resource; add_mirrored_resource stores the length of the downloaded content.
    size = models.BigIntegerField()
    # JSON text written by add_mirrored_resource (mirror name -> URL); NULL when unset.
    # NOTE(review): limited to 500 characters, which caps the number of mirrors -- confirm this is intended.
    mirrors = models.CharField(max_length=500, null=True, default=None)


class DatasetHasMirroredResource(models.Model):
    """Link between a dataset and a mirrored resource."""

    # Both sides cascade: deleting the dataset or the resource deletes the link.
    dataset = models.ForeignKey(Dataset, on_delete=models.CASCADE)
    mirrored_resource = models.ForeignKey(MirroredResource, on_delete=models.CASCADE)


class Software(models.Model):
software_id = models.CharField(max_length=150)
vm = models.ForeignKey(VirtualMachine, on_delete=models.CASCADE)
Expand Down
7 changes: 5 additions & 2 deletions frontend/src/ClaimSubmission.vue
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
<v-btn>Claim Ownership</v-btn>
<div class="py-2"></div>
<v-divider/>
<!--<h3 class="my-1">Inspect Submission</h3>
<run-page :dataset_id="dataset.dataset_id" :chatnoir_id="dataset.chatnoir_id" :run_uuid="uuid"/>-->
</div>
</div>
</v-container>
Expand All @@ -59,10 +61,11 @@ import { inject } from 'vue'
import { get, chatNoirUrl, irDatasetsUrl, type UserInfo, type DatasetInfo, type ClaimSubmissionInfo } from './utils';
import { Loading, TiraBreadcrumb } from './components'
import RunPage from './tirex/RunPage.vue'
export default {
name: "claim-submission",
components: { Loading, TiraBreadcrumb },
components: { Loading, TiraBreadcrumb, RunPage },
data() {
return {
userinfo: inject('userinfo') as UserInfo,
Expand Down
14 changes: 8 additions & 6 deletions frontend/src/Datasets.vue
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@

<v-data-table :headers="headers_xs" :items="datasets" :itemsPerPage="10" :search="query" density="compact" fixed-footer>
<template #item.display_name="{ item }">
<a v-if="item.default_task" :href="'/task-overview/' + item.default_task.task_name + '/' + item.dataset_id" style="text-decoration: none !important;">{{ item.display_name }}</a>
<a v-if="item.default_task" :href="'/task-overview/' + item.default_task + '/' + item.dataset_id" style="text-decoration: none !important;">{{ item.display_name }}</a>
<span v-if="!item.default_task">{{ item.display_name }}</span>
</template>
<template #item.default_task="{ item }">
<a v-if="item.default_task" :href="'/task-overview/' + item.default_task.task_name" style="text-decoration: none !important;">{{ item.default_task.task_name }}</a>
<span v-if="!item.default_task"> No Task </span>
<a v-if="item.default_task" :href="'/task-overview/' + item.default_task + '/' + item.dataset_id" style="text-decoration: none !important;">{{ item.default_task_name }}</a>
<span v-if="!item.default_task">No Task</span>
</template>
<template #item.ir_datasets_id="{ item }">
<a v-if="ir_datasets_url(item)" :href="ir_datasets_url(item)" style="text-decoration: none !important;" target="_blank">{{item.ir_datasets_id}}</a>
Expand All @@ -29,11 +29,13 @@
<a v-if="chatnoir_url(item)" :href="chatnoir_url(item)" style="text-decoration: none !important;" target="_blank">ChatNoir</a>
</template>
<template #item.type="{ item }">
Public Training
<span v-if="item.is_confidential">Private Test</span>
<span v-if="!item.is_confidential">Public Training</span>
</template>
<template #item.mirrors="{ item }">
  <!-- One external link per mirror; the mirror name is the link text and the list key. -->
  <p v-for="[k, v] of Object.entries(item.mirrors || {})" :key="k">
    <a :href="v + ''" style="text-decoration: none !important;" target="_blank" rel="noopener noreferrer">{{k}}</a>
  </p>
</template>
</v-data-table>
</div>
Expand Down
5 changes: 0 additions & 5 deletions frontend/src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,8 @@ export default function register_app() {
{ path: '/:pathMatch(.*)*', component: Home },
]

console.log(tiraConf.rest_endpoint)
fetchWellKnownAPIs(tiraConf.rest_endpoint).then(wellKnown => { console.log(tiraConf.rest_endpoint + ' -> ' + wellKnown) })
fetchWellKnownAPIs('https://tira.io').then(wellKnown => { console.log('https://tira.io -> ' + wellKnown) })

fetchWellKnownAPIs(tiraConf.rest_endpoint).then(wellKnown => {
if (wellKnown.archived.toLowerCase().includes('://' + location.host.toLowerCase())) {
console.log('This client only works on the archived backup of TIRA.')
wellKnown.grpc = wellKnown.archived
wellKnown.api = wellKnown.archived
}
Expand Down
Loading

0 comments on commit e516e69

Please sign in to comment.