
Commit 9855fd8: Add sample data, Job class
Parent: b42d26c

42 files changed: +509 -194 lines. Large commits have some content hidden by default, so only a subset of the changed files is shown below.

iptk/dataset.py (+12 -38)

@@ -1,43 +1,17 @@
-import os, shutil
+import os, re, shutil
 
 class Dataset(object):
     """
-    Instances of the Dataset class represent IPTK datasets on disk. A valid
-    path must be passed to the instance upon creation. An IPTK dataset will be
-    created at the given path if it does not exist already.
+    Instances of the Dataset class represent IPTK datasets in an abstract
+    fashion. A concrete dataset is specified by the combination of a
+    Dataset and a DatasetStore instance.
     """
-    def __init__(self, path):
+    def __init__(self, identifier, store=None):
         super(Dataset, self).__init__()
-        self.path = path
-        self.initialize()
-
-    @property
-    def data_path(self):
-        return os.path.join(self.path, 'data')
-
-    def initialize(self):
-        """
-        Initializes the dataset by creating an empty IPTK dataset structure
-        including the data/, temp/, and meta/ directories. After
-        initialization, the dataset can be edited until it is locked by a call
-        to the lock() method or the removal of the temp/ directory.
-        Initializing an existing dataset is a no-op.
-        """
-        if not os.path.exists(self.data_path):
-            subdirs = ["temp", "data", "meta"]
-            for s in subdirs:
-                os.makedirs(os.path.join(self.path, s), exist_ok=True)
-
-    def lock(self):
-        """
-        Locks the dataset. Locked datasets can be used as job inputs but the
-        content of their data/ directory must remain unchanged. A locked
-        dataset is indicated by the absence of a temp/ subdirectory. Unlocking
-        a dataset by re-creating temp/ is not allowed and may lead to
-        unpleasant side effects. Locking a locked dataset is a no-op.
-        """
-        tmp_dir = os.path.join(self.path, 'temp')
-        if os.path.exists(tmp_dir):
-            shutil.rmtree(tmp_dir)
-
-
+        if not re.match("^[0-9a-z]{40}$", identifier):
+            raise ValueError('Invalid dataset identifier')
+        self.identifier = identifier
+        self.store = store
+
+    def __repr__(self):
+        return f"<{self.__class__.__name__} {self.identifier}>"
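
With this change a Dataset no longer touches the disk; it is just a validated identifier plus an optional store reference. A minimal usage sketch, assuming the package layout above (the 40-character identifier is a made-up placeholder):

from iptk.dataset import Dataset

dataset = Dataset("a" * 40)   # any string matching ^[0-9a-z]{40}$ is accepted
print(dataset)                # <Dataset aaaa...aaaa>
Dataset("not-an-identifier")  # raises ValueError('Invalid dataset identifier')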

iptk/dataset_store.py (+75 -8)

@@ -1,4 +1,4 @@
-import os, re
+import json, os, re, shutil
 from .dataset import Dataset
 
 class DatasetStore(object):
@@ -7,6 +7,51 @@ def __init__(self, root_path):
         super(DatasetStore, self).__init__()
         self.root_path = root_path
 
+    def get_path(self, dataset):
+        """
+        Returns the path of the requested dataset on disk.
+        """
+        subdir = "/".join(list(dataset.identifier[:4]))
+        path = os.path.join(self.root_path, subdir, dataset.identifier)
+        return path
+
+    def get_data_path(self, dataset, mutable=None):
+        """
+        Returns the full path to the given dataset's data/ directory. If the
+        mutable argument is set, an exception is raised for locked datasets (if
+        mutable == True) or unlocked datasets (if mutable == False).
+        """
+        if (mutable is not None) and (self.is_locked(dataset) == mutable):
+            raise ValueError("Dataset lock state does not match request")
+        dataset_path = self.get_path(dataset)
+        return os.path.join(dataset_path, "data")
+
+    def get_meta_path(self, dataset, spec):
+        dataset_path = self.get_path(dataset)
+        return os.path.join(dataset_path, "meta", f"{spec.identifier}.json")
+
+    def get_metadata(self, dataset, spec):
+        """
+        Returns the metadata saved within this store for the given combination
+        of dataset and metadata specification.
+        """
+        path = self.get_meta_path(dataset, spec)
+        if not os.path.exists(path):
+            return None
+        with open(path, "r") as f:
+            return json.load(f)
+
+    def set_metadata(self, dataset, spec, data):
+        """
+        Sets the metadata for the given dataset and metadata specification. The
+        data object must be JSON-serializable. Note that your changes may be
+        overwritten if the metadata specification is associated with a metadata
+        generator.
+        """
+        path = self.get_meta_path(dataset, spec)
+        with open(path, "w+") as f:
+            return json.dump(data, f, sort_keys=True, indent=4, separators=(',', ': '))
+
     def get_dataset(self, dataset_id, create_ok=False):
         """
         Fetch a Dataset object backed by this DatasetStore. Raises a value
@@ -15,10 +60,32 @@ def get_dataset(self, dataset_id, create_ok=False):
         method can optionally create an empty dataset if no dataset with the
         given identifier exists.
         """
-        if not re.match("^[0-9a-z]{40}$", dataset_id):
-            raise ValueError('Invalid dataset identifier')
-        subdir = "/".join(list(dataset_id[:4]))
-        path = os.path.join(self.root_path, subdir, dataset_id)
-        if not os.path.exists(path) and not create_ok:
-            raise ValueError('No existing dataset at path and create_ok is False')
-        return Dataset(path)
+        dataset = Dataset(dataset_id, self)
+        path = self.get_path(dataset)
+        if not os.path.exists(path):
+            if create_ok:
+                subdirs = ["temp", "data", "meta"]
+                for s in subdirs:
+                    os.makedirs(os.path.join(path, s), exist_ok=True)
+            else:
+                raise ValueError("Dataset not found in this store")
+        return dataset
+
+    def is_locked(self, dataset):
+        tmp_dir = os.path.join(self.get_path(dataset), 'temp')
+        return not os.path.exists(tmp_dir)
+
+    def lock_dataset(self, dataset):
+        """
+        Locks the dataset. Locked datasets can be used as job inputs but the
+        content of their data/ directory must remain unchanged. A locked
+        dataset is indicated by the absence of a temp/ subdirectory. Unlocking
+        a dataset by re-creating temp/ is not allowed and may lead to
+        unpleasant side effects. Locking a locked dataset is a no-op.
+        """
+        tmp_dir = os.path.join(self.get_path(dataset), 'temp')
+        if os.path.exists(tmp_dir):
+            shutil.rmtree(tmp_dir)
+
+    def __repr__(self):
+        return f"<{self.__class__.__name__} {self.root_path}>"
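
The get_path() method shards datasets on disk: the first four characters of the identifier each become one directory level, which keeps any single directory from accumulating too many entries. A sketch of the resulting layout and the lock workflow, assuming a store rooted at the hypothetical path /data/iptk and a placeholder identifier:

from iptk.dataset_store import DatasetStore

store = DatasetStore("/data/iptk")
dataset = store.get_dataset("ab12" + "0" * 36, create_ok=True)
print(store.get_path(dataset))
# /data/iptk/a/b/1/2/ab120000...0 (one directory level per leading character)
print(store.is_locked(dataset))               # False: temp/ still exists
data_dir = store.get_data_path(dataset, mutable=True)  # fine while unlocked
store.lock_dataset(dataset)                   # removes temp/, locking the dataset
store.get_data_path(dataset, mutable=True)    # now raises ValueError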

iptk/docker_utils.py (+71 -39)

@@ -1,45 +1,77 @@
 #!/usr/local/bin/python3
 import requests
 
-def get_digest(registry, repository, tag):
-    manifest_url = f"https://{registry}/v2/{repository}/manifests/{tag}"
-    headers = {"Accept": "application/vnd.docker.distribution.manifest.v2+json"}
-    r = requests.head(manifest_url, headers=headers)
-    if r.status_code == 401:
-        token_url = f"https://auth.docker.io/token"
-        token_params = {
-            "service": "registry.docker.io",
-            "scope": f"repository:{repository}:pull"
+class DockerImage(object):
+    """
+    Represents a specific Docker image in IPTK. While this can be constructed
+    from a human-readable image reference, the reference will be resolved to a
+    digest value on instance creation.
+    """
+    def __init__(self, reference):
+        super(DockerImage, self).__init__()
+        self.resolve(reference)
+
+    @classmethod
+    def from_dict(cls, specification):
+        registry = specification["registry"]
+        repository = specification["repository"]
+        digest = specification["digest"]
+        reference = f"{registry}/{repository}@{digest}"
+        return cls(reference)
+
+    @property
+    def spec(self):
+        spec = {
+            "registry": self.registry,
+            "repository": self.repository,
+            "digest": self.digest
         }
-        token = requests.get(token_url, params=token_params, json=True).json()["token"]
-        headers["Authorization"] = f"Bearer {token}"
+        return spec
+
+
+    def get_digest(self, registry, repository, tag):
+        manifest_url = f"https://{registry}/v2/{repository}/manifests/{tag}"
+        headers = {"Accept": "application/vnd.docker.distribution.manifest.v2+json"}
         r = requests.head(manifest_url, headers=headers)
-    return r.headers['Docker-Content-Digest']
+        if r.status_code == 401:
+            token_url = f"https://auth.docker.io/token"
+            token_params = {
+                "service": "registry.docker.io",
+                "scope": f"repository:{repository}:pull"
+            }
+            token = requests.get(token_url, params=token_params, json=True).json()["token"]
+            headers["Authorization"] = f"Bearer {token}"
+            r = requests.head(manifest_url, headers=headers)
+        return r.headers['Docker-Content-Digest']
 
-def get_parts(reference):
-    # Docker default values
-    registry = "registry-1.docker.io"
-    repository = reference
-    tag = "latest"
-    digest = None
-    # Parse domain part, if any
-    if "/" in reference:
-        domain, remainder = reference.split("/", 1)
-        if domain == "localhost" or "." in domain or ":" in domain:
-            registry = domain
-            repository = remainder
-    # Separate image reference and digest
-    if "@" in repository:
-        repository, digest = repository.split("@", 1)
-    # See if image contains a tag
-    if ":" in repository:
-        repository, tag = repository.split(":", 1)
-    # Handle "familiar" Docker references
-    if registry == "registry-1.docker.io" and "/" not in repository:
-        repository = "library/" + repository
-    if not digest:
-        digest = get_digest(registry, repository, tag)
-    return (registry, repository, tag, digest)
-
-def get_reference(registry, repository, digest):
-    return f"{registry}/{repository}@{digest}"
+    def resolve(self, reference):
+        # Docker default values
+        registry = "registry-1.docker.io"
+        repository = reference
+        tag = "latest"
+        digest = None
+        # Parse domain part, if any
+        if "/" in repository:
+            domain, remainder = repository.split("/", 1)
+            if domain == "localhost" or "." in domain or ":" in domain:
+                registry = domain
+                repository = remainder
+        # Separate image reference and digest
+        if "@" in repository:
+            repository, digest = repository.split("@", 1)
+        # See if image contains a tag
+        if ":" in repository:
+            repository, tag = repository.split(":", 1)
+        # Handle "familiar" Docker references
+        if registry == "registry-1.docker.io" and "/" not in repository:
+            repository = "library/" + repository
+        if not digest:
+            digest = self.get_digest(registry, repository, tag)
+        self.registry = registry
+        self.repository = repository
+        self.tag = tag
+        self.digest = digest
+        self.reference = f"{registry}/{repository}@{digest}"
+
+    def __repr__(self):
+        return f"<{self.__class__.__name__} {self.reference}>"
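
Because resolve() runs on instance creation, constructing a DockerImage from a "familiar" name immediately normalizes it and fetches the digest from the registry. An illustrative sketch; it requires network access to Docker Hub, and the digest is elided because it depends on the image's current contents:

from iptk.docker_utils import DockerImage

image = DockerImage("python:3.6")
print(image.registry)     # registry-1.docker.io (the Docker default)
print(image.repository)   # library/python (familiar name expanded)
print(image.tag)          # 3.6
print(image.reference)    # registry-1.docker.io/library/python@sha256:...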

iptk/job.py (+68)

@@ -0,0 +1,68 @@
+from datetime import datetime
+from .json_utils import json_hash
+from .metadata_spec import MetadataSpec
+from .mount import Mount
+from .docker_utils import DockerImage
+
+class Job(object):
+    """
+    IPTK jobs take a variable number of input datasets and a Docker image
+    specification. The IPTK runner will create a container from the given image
+    and mount the input datasets as specified. An output dataset will
+    automatically be created and its data/ directory will be mounted in the
+    container at /output. The output dataset will automatically be locked after
+    the container has finished executing.
+    """
+    def __init__(self, image, mounts, command=None):
+        super(Job, self).__init__()
+        self.image = image
+        self.mounts = mounts
+        self.command = command
+
+    @classmethod
+    def from_dict(cls, specification):
+        image = DockerImage.from_dict(specification["image"])
+        mounts = []
+        for m in specification["mounts"]:
+            mounts.append(Mount.from_dict(m))
+        command = specification.get("command", None)
+        return cls(image, mounts, command)
+
+    @property
+    def minimal_spec(self):
+        spec = {
+            "command": self.command,
+            "image": self.image.spec,
+            "mounts": [m.spec for m in self.mounts]
+        }
+        return spec
+
+    @property
+    def spec(self):
+        spec = self.minimal_spec
+        return spec
+
+    @property
+    def identifier(self):
+        """
+        A job's identifier is also the identifier of the resulting dataset if
+        the job is executed by the IPTK runner.
+        """
+        return json_hash(self.minimal_spec)
+
+    def save(self, store):
+        """
+        Save this job to a dataset store. This creates a new dataset with this
+        job's specification stored in its metadata. An IPTK runner can then be
+        used to run the job.
+        """
+        meta_spec = MetadataSpec("University of Münster", "IPTK Job", 3)
+        dataset = store.get_dataset(self.identifier, create_ok=True)
+        current_job_spec = store.get_metadata(dataset, meta_spec)
+        if current_job_spec:
+            current_job = Job.from_dict(current_job_spec)
+            if self.minimal_spec != current_job.minimal_spec:
+                raise Exception("Different job exists with equal identifier")
+        store.set_metadata(dataset, meta_spec, self.spec)
+
+
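
Jobs are content-addressed: the identifier is the hash of the minimal spec, so identical jobs always map to the same output dataset, and save() refuses to proceed if that dataset already carries a different job spec. A hypothetical end-to-end sketch; the Mount class is not part of this diff, so the job below runs without input mounts, and the store root and command format are assumptions:

from iptk.dataset_store import DatasetStore
from iptk.docker_utils import DockerImage
from iptk.job import Job

store = DatasetStore("/data/iptk")    # hypothetical store root
image = DockerImage("python:3.6")     # resolved to a digest on creation
job = Job(image, [], command="python -c 'print(42)'")
print(job.identifier)  # json_hash of the minimal spec; also the output dataset id
job.save(store)        # writes the spec into the dataset's metadata for a runner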
