logger = logging.getLogger(__name__)
logger.setLevel(logging.WARNING)

-icecube_data_base_url = "https://icecube.wisc.edu/data-releases"
+# icecube_data_base_url = "https://icecube.wisc.edu/data-releases"
data_directory = os.path.abspath(os.path.join(os.path.expanduser("~"), ".icecube_data"))

+available_datasets = {
+    "20210126": {
+        "url": "https://dataverse.harvard.edu/api/access/dataset/:persistentId/?persistentId=doi:10.7910/DVN/VKL316",
+        "dir": "20210126_PS-IC40-IC86_VII",
+        "subdir": "icecube_10year_ps",
+    },
+    "20181018": {
+        "url": "https://icecube.wisc.edu/data-releases/20181018_All-sky_point-source_IceCube_data%20_years_2010-2012.zip",
+        "dir": "20181018_All-sky_point-source_IceCube_data%20_years_2010-2012",
+        "subdir": ""
+    },
+    "20150820": {
+        "url": "https://icecube.wisc.edu/data-releases/20150820_Astrophysical_muon_neutrino_flux_in_the_northern_sky_with_2_years_of_IceCube_data.zip",
+        "dir": "20150820_Astrophysical_muon_neutrino_flux_in_the_northern_sky_with_2_years_of_IceCube_data",
+        "subdir": ""
+    },
+    "20131121": {
+        "url": "https://icecube.wisc.edu/data-releases/20131121_Search_for_contained_neutrino_events_at_energies_above_30_TeV_in_2_years_of_data.zip",
+        "dir": "20131121_Search_for_contained_neutrino_events_at_energies_above_30_TeV_in_2_years_of_data",
+        "subdir": "",
+    }
+}
+
available_irf_periods = ["IC40", "IC59", "IC79", "IC86_I", "IC86_II"]

available_data_periods = [
@@ -50,10 +73,10 @@ class IceCubeData:

    def __init__(
        self,
-        base_url=icecube_data_base_url,
+        # base_url=icecube_data_base_url,
        data_directory=data_directory,
        cache_name=".cache",
-        update=False,
+        # update=False,
    ):
        """
        Handle the interface with IceCube's public data
@@ -65,7 +88,7 @@ def __init__(
        :param update: Refresh the cache if true
        """

-        self.base_url = base_url
+        # self.base_url = base_url

        self.data_directory = data_directory

@@ -74,21 +97,21 @@ def __init__(
            expire_after=-1,
        )

-        self.ls(verbose=False, update=update)
-
        # Make data directory if it doesn't exist
        if not os.path.exists(self.data_directory):
            os.makedirs(self.data_directory)

+
    def ls(self, verbose=True, update=False):
        """
        List the available datasets.

        :param verbose: Print the datasets if true
        :param update: Refresh the cache if true
        """
+        raise NotImplementedError()

-        self.datasets = []
+        available_datasets = []

        if update:
            requests_cache.clear()
@@ -104,7 +127,7 @@ def ls(self, verbose=True, update=False):
                href = link.get("href")

                if ".zip" in href:
-                    self.datasets.append(href)
+                    available_datasets.append(href)

                    if verbose:
                        print(href)
@@ -116,7 +139,7 @@ def find(self, search_string):

        found_datasets = []

-        for dataset in self.datasets:
+        for dataset in available_datasets:
            if search_string in dataset:
                found_datasets.append(dataset)

@@ -137,44 +160,49 @@ def fetch(self, datasets, overwrite=False, write_to=None):
            self.data_directory = write_to

        for dataset in datasets:
-            if dataset not in self.datasets:
+            if dataset not in available_datasets:
                raise ValueError(
                    "Dataset %s is not in list of known datasets" % dataset
                )
-
-            url = os.path.join(self.base_url, dataset)
-
-            local_path = os.path.join(self.data_directory, dataset)
-
+
+            ds = available_datasets[dataset]
+            url = ds["url"]
+            dl_dir = ds["dir"]
+            local_path = os.path.join(self.data_directory, dl_dir)
+            subdir = ds["subdir"]
+            file = os.path.join(local_path, dl_dir + ".zip")
            # Only fetch if not already there!
-            if not os.path.exists(os.path.splitext(local_path)[0]) or overwrite:
+            if not os.path.exists(local_path) or overwrite:
+                os.makedirs(local_path, exist_ok=True)
                # Don't cache this as we want to stream
                with requests_cache.disabled():
                    response = requests.get(url, stream=True)

                    if response.ok:
-                        total = int(response.headers["content-length"])

                        # For progress bar description
                        short_name = dataset
                        if len(dataset) > 40:
                            short_name = dataset[0:40] + "..."

                        # Save locally
-                        with open(local_path, "wb") as f, tqdm(
-                            desc=short_name, total=total
+                        with open(file, "wb") as f, tqdm(
+                            desc=short_name,
                        ) as bar:
                            for chunk in response.iter_content(chunk_size=1024 * 1024):
                                size = f.write(chunk)
                                bar.update(size)

                        # Unzip
-                        dataset_dir = os.path.splitext(local_path)[0]
-                        with ZipFile(local_path, "r") as zip_ref:
+                        if subdir:
+                            dataset_dir = os.path.join(local_path, subdir)
+                        else:
+                            dataset_dir = local_path
+                        with ZipFile(file, "r") as zip_ref:
                            zip_ref.extractall(dataset_dir)

                        # Delete zipfile
-                        os.remove(local_path)
+                        os.remove(file)

                        # Check for further compressed files in the extraction
                        tar_files = find_files(dataset_dir, ".tar")
@@ -198,22 +226,28 @@ def fetch_all_to(self, write_to, overwrite=False):
        """
        Download all data to a given location
        """
-
-        self.fetch(self.datasets, write_to=write_to, overwrite=overwrite)
+        raise NotImplementedError()
+        self.fetch(list(available_datasets.keys()), write_to=write_to, overwrite=overwrite)

    def get_path_to(self, dataset):
        """
        Get path to a given dataset
        """

-        if dataset not in self.datasets:
+        if dataset not in available_datasets.keys():
            raise ValueError("Dataset is not available")
+
+        ds = available_datasets[dataset]
+        dl_dir = ds["dir"]
+        local_path = os.path.join(self.data_directory, dl_dir)
+        subdir = ds["subdir"]
+        #file = os.path.join(local_path, dl_dir+".zip")

-        local_zip_loc = os.path.join(self.data_directory, dataset)
+        # local_zip_loc = os.path.join(self.data_directory, dataset)

-        local_path = os.path.splitext(local_zip_loc)[0]
+        path = os.path.join(local_path, subdir)

-        return local_path
+        return path


class ddict(dict):
@@ -942,12 +976,20 @@ def from_event_files(
            else:
                temp = cls(seed=42)
                temp.events = {}
-                temp.events[p] = np.loadtxt(
-                    join(
-                        data_directory,
-                        f"20210126_PS-IC40-IC86_VII/icecube_10year_ps/events/{p}_exp.csv",
+                try:
+                    temp.events[p] = np.loadtxt(
+                        join(
+                            data_directory,
+                            f"20210126_PS-IC40-IC86_VII/icecube_10year_ps/events/{p}_exp.csv",
+                        )
+                    )
+                except FileNotFoundError:
+                    temp.events[p] = np.loadtxt(
+                        join(
+                            data_directory,
+                            f"20210126_PS-IC40-IC86_VII/icecube_10year_ps/events/{p}_exp-1.csv",
+                        )
                    )
-                )
                temp._periods.append(p)
                temp._sort()
                RealEvents.STACK[p] = temp
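
Below is a minimal usage sketch of the refactored interface, kept to names that appear in the diff above (IceCubeData, fetch, get_path_to, and the keys of available_datasets). The import path is an assumption and may differ in the actual package layout.

# Usage sketch -- assumes IceCubeData is importable from this module;
# adjust the import to wherever the class actually lives in the repository.
from icecube_tools.utils.data import IceCubeData  # hypothetical import path

my_data = IceCubeData()  # data_directory defaults to ~/.icecube_data

# "20210126" is one of the keys of available_datasets defined above.
my_data.fetch(["20210126"])  # downloads and unzips; skipped if already present

events_path = my_data.get_path_to("20210126")
print(events_path)  # .../20210126_PS-IC40-IC86_VII/icecube_10year_ps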