nsidc · eigenbeam · Oct 9, 2024 · Sep 18, 2024 · Sep 18, 2024 · Sep 19, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 __pycache__
-dist
+dist
+example/test.ini
diff --git a/README.md b/README.md
@@ -75,6 +75,58 @@ With the Poetry shell running, start the instameta tool and verify that it’s w
       init
       process
 
+## AWS Credentials
+
+In order to process science data and stage it for Cumulus, you must first create and set up your AWS
+credentials. Several options for doing this are given here:
+
+### Manually Creating Configuration Files
+
+First, create a directory in your user's home directory to store the AWS configuration:
+
+    $ mkdir -p ~/.aws
+
+In the `~/.aws` directory, create a file named `config` with the contents:
+
+    [default]
+    region = us-west-2
+    output = json
+
+In the `~/.aws` directory, create a file named `credentials` with the contents:
+
+    [default]
+    aws_access_key_id = TBD
+    aws_secret_access_key = TBD
+
+Finally, restrict the permissions of the directory and files:
+
+    $ chmod -R go-rwx ~/.aws
+
+When you obtain your AWS access key (not covered here), edit the `~/.aws/credentials` file
+and replace each `TBD` with the access key ID and secret access key values.
+
+### Using the AWS CLI
+
+You may install the AWS Command Line Interface (or may already have it installed) on the
+machine where you are running the tool. Follow the
+[AWS CLI Install instructions](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html)
+for the platform on which you are running.
+
+Once you have the AWS CLI, you can use it to create the `~/.aws` directory and the
+`config` and `credentials` files:
+
+    $ aws configure
+
+You will be prompted to enter your AWS access key ID and secret access key values, along with
+the AWS region and CLI output format. The AWS CLI will create and populate the directory
+and files with your values.
+
+If you require access to multiple AWS accounts, each with their own configuration--for
+example, different accounts for pre-production vs. production--you can use the AWS CLI
+'profile' feature to manage settings for each account. See the [AWS configuration 
+documentation](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html#cli-configure-files-using-profiles)
+for the details.
+
 ## Usage
 
 * Show the help text:
@@ -89,6 +141,12 @@ With the Poetry shell running, start the instameta tool and verify that it’s w
 
         $ instameta info --config example/modscg.ini
 
+* Process science data and stage it for Cumulus:
+
+        # Source the AWS profile (once) before running 'process'-- use 'default' or a named profile
+        $ source scripts/env.sh default
+        $ instameta process --config example/modscg.ini
+
 * Exit the Poetry shell:
 
         $ exit
@@ -114,10 +172,14 @@ TBD
 
         $ poetry install
 
-### Running tests:
+### Run tests:
 
         $ poetry run pytest
 
+### Run tests when source changes (uses [pytest-watcher](https://github.com/olzhasar/pytest-watcher)):
+
+        $ poetry run ptw . --now --clear
+
 ## Credit
 
 This content was developed by the National Snow and Ice Data Center with funding from

diff --git a/example/modscg.ini b/example/modscg.ini
@@ -9,5 +9,8 @@ provider = FORTY_TWO
 [Destination]
 local_output_dir = ./json
 ummg_dir = ummg
-kinesis_arn = abcd-1234-wxyz-0666
+kinesis_stream_name = abcd-${environment}-1234-wxyz-0666
+write_cnm_file = True
 
+[Settings]
+checksum_type = SHA256
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,11 +12,14 @@ python = "^3.12"
 click = "^8.1.7"
 pyfiglet = "^1.0.2"
 netCDF4 = "^1.6.5"
-
 rich = "^13.7.1"
+boto3 = "^1.35.22"
+
 [tool.poetry.group.test.dependencies]
 pytest = "^8.3.2"
+moto = {extras = ["all"], version = "^5.0.14"}
 
+pytest-watcher = "^0.4.3"
 [tool.poetry.group.dev.dependencies]
 ruff = "^0.5.5"
 mypy = "^1.11.1"

diff --git a/scripts/env.sh b/scripts/env.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#
+# Exports the AWS settings of a named AWS CLI profile into the current shell.
+# Source this script (do not execute it) so the exported variables persist.
+
+if (( $# != 1 )); then
+    echo "Usage: source env.sh aws_profile_name"
+    echo "       where aws_profile_name is an AWS CLI named profile"
+    echo "       https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html"
+    # 'exit' would close the shell that sourced this script, so 'return' instead;
+    # fall back to 'exit' only when the script is executed directly.
+    return 1 2>/dev/null || exit 1
+else
+    export AWS_PROFILE="$1"
+
+    AWS_ACCESS_KEY_ID=$(aws configure get aws_access_key_id --profile "$AWS_PROFILE")
+    AWS_SECRET_ACCESS_KEY=$(aws configure get aws_secret_access_key --profile "$AWS_PROFILE")
+    AWS_REGION=$(aws configure get region --profile "$AWS_PROFILE" || echo "$AWS_DEFAULT_REGION")
+    AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query "Account" --output text)
+
+    export AWS_ACCESS_KEY_ID
+    export AWS_SECRET_ACCESS_KEY
+    export AWS_REGION
+    export AWS_ACCOUNT_ID
+
+    echo "AWS environment:"
+    echo "  AWS_PROFILE:          $AWS_PROFILE"
+    echo "  AWS_REGION:           $AWS_REGION"
+    echo "  AWS_ACCOUNT_ID:       $AWS_ACCOUNT_ID"
+fi
diff --git a/src/nsidc/metgen/aws.py b/src/nsidc/metgen/aws.py
@@ -0,0 +1,39 @@
+import boto3
+
+
+KINESIS_PARTITION_KEY = "metgenc-duck"
+DEFAULT_REGION = "us-west-2"
+
+
+def kinesis_stream_exists(stream_name, region_name=DEFAULT_REGION):
+    """Returns True if the Kinesis stream can be described, False otherwise.
+
+    Any exception raised by the describe call -- not only a missing stream --
+    results in False rather than being raised to the caller.
+    """
+    client = boto3.client("kinesis", region_name=region_name)
+    try:
+        client.describe_stream_summary(StreamName=stream_name)
+    except Exception:
+        return False
+    return True
+
+
+def post_to_kinesis(stream_name, cnm_message, region_name=DEFAULT_REGION):
+    """Posts a message to a Kinesis stream and returns the ShardId of the record.
+
+    Any error raised by the put is printed and then re-raised to the caller.
+    """
+    client = boto3.client("kinesis", region_name=region_name)
+    try:
+        result = client.put_record(
+            StreamName=stream_name,
+            Data=cnm_message,
+            PartitionKey=KINESIS_PARTITION_KEY,
+        )
+    except Exception as e:
+        print(e)
+        # Bare raise re-raises the active exception unchanged.
+        raise
+    print(f'Published CNM message {cnm_message} to stream: {stream_name}')
+    return result['ShardId']
diff --git a/src/nsidc/metgen/cli.py b/src/nsidc/metgen/cli.py
@@ -1,5 +1,6 @@
 import click
 
+from nsidc.metgen import config
 from nsidc.metgen import metgen
 from nsidc.metgen import constants
 
@@ -12,29 +13,42 @@ def cli():
     pass
 
 @cli.command()
-@click.option('--config', help='Path to configuration file to create or replace')
+@click.option('-c', '--config', help='Path to configuration file to create or replace')
 def init(config):
     """Populates a configuration file based on user input."""
     click.echo(metgen.banner())
     config = metgen.init_config(config)
     click.echo(f'Initialized the metgen configuration file {config}')
 
 @cli.command()
-@click.option('--config', help='Path to configuration file to display', required=True)
-def info(config):
+@click.option('-c', '--config', 'config_filename', help='Path to configuration file to display', required=True)
+def info(config_filename):
     """Summarizes the contents of a configuration file."""
     click.echo(metgen.banner())
-    configuration = metgen.configuration(metgen.config_parser(config))
+    configuration = config.configuration(config.config_parser_factory(config_filename), {})
     configuration.show()
 
 @cli.command()
-@click.option('--config', help='Path to configuration file', required=True)
-@click.option('--env', help='environment', default=constants.DEFAULT_CUMULUS_ENVIRONMENT, show_default=True)
-def process(config, env=constants.DEFAULT_CUMULUS_ENVIRONMENT):
+@click.option('-c', '--config', 'config_filename', help='Path to configuration file', required=True)
+@click.option('-e', '--env', help='environment', default=constants.DEFAULT_CUMULUS_ENVIRONMENT, show_default=True)
+@click.option('-n', '--number', help="Process at most 'count' granules.", metavar='count', type=int, required=False, default=None)
+@click.option('-wc', '--write-cnm', is_flag=True, help="Write CNM messages to files.")
+def process(config_filename, env, write_cnm, number):
     """Processes science data files based on configuration file contents."""
     click.echo(metgen.banner())
-    configuration = metgen.configuration(metgen.config_parser(config), env)
-    metgen.process(configuration)
+    # Only options given on the command line become overrides; a value of None
+    # lets the corresponding configuration file setting (or its default) apply.
+    overrides = {
+        'write_cnm_file': True if write_cnm else None,
+        'number': number,
+    }
+    try:
+        configuration = config.configuration(config.config_parser_factory(config_filename), overrides, env)
+        metgen.process(configuration)
+    except Exception as e:
+        # Top-level boundary: report the failure and exit with a non-zero status.
+        click.echo("\nUnable to process data: " + str(e))
+        raise SystemExit(1)
-    click.echo(f'Processed granules using the configuration file {config}')
+    click.echo(f'Processed granules using the configuration file {config_filename}')
 
 if __name__ == "__main__":

diff --git a/src/nsidc/metgen/config.py b/src/nsidc/metgen/config.py
@@ -0,0 +1,139 @@
+import configparser
+import dataclasses
+from datetime import datetime, timezone
+import os.path
+import uuid
+
+from nsidc.metgen import aws
+from nsidc.metgen import constants
+
+
+@dataclasses.dataclass
+class Config:
+    """Holds the settings for one run, from the configuration file and overrides."""
+
+    environment: str
+    data_dir: str
+    auth_id: str
+    # NOTE(review): configuration() reads 'version' with getint, so this holds an
+    # int at runtime despite the str annotation -- confirm the intended type.
+    version: str
+    provider: str
+    local_output_dir: str
+    ummg_dir: str
+    kinesis_stream_name: str
+    write_cnm_file: bool
+    checksum_type: str
+    number: int
+
+    def show(self):
+        """Prints each configuration field and its value."""
+        # TODO add section headings in the right spot (if we think we need them in the output)
+        print()
+        print('Using configuration:')
+        for k, v in self.__dict__.items():
+            print(f'  + {k}: {v}')
+
+    def enhance(self, producer_granule_id):
+        """Returns the configuration as a dict extended with per-granule values.
+
+        Adds 'producer_granule_id', the current UTC time in ISO 8601 format as
+        'submission_time', and a newly generated 'uuid'.
+        """
+        mapping = dataclasses.asdict(self)
+        collection_details = self.collection_from_cmr(mapping)
+
+        mapping['auth_id'] = collection_details['auth_id']
+        mapping['version'] = collection_details['version']
+        mapping['producer_granule_id'] = producer_granule_id
+        mapping['submission_time'] = datetime.now(timezone.utc).isoformat()
+        mapping['uuid'] = str(uuid.uuid4())
+
+        return mapping
+
+    # Is this the right place for this function?
+    def collection_from_cmr(self, mapping):
+        """Returns the 'auth_id' and 'version' of the collection described by mapping.
+
+        Placeholder: currently echoes the values already present in mapping.
+        """
+        # TODO: Use auth_id and version from mapping object to retrieve collection
+        # metadata from CMR, including formatted version number, temporal range, and
+        # spatial coverage.
+        return {
+            'auth_id': mapping['auth_id'],
+            'version': mapping['version']
+        }
+
+
+def config_parser_factory(configuration_file):
+    """Returns a ConfigParser populated from configuration_file.
+
+    Raises ValueError if configuration_file is None or does not exist.
+    """
+    if configuration_file is None or not os.path.exists(configuration_file):
+        raise ValueError(f'Unable to find configuration file {configuration_file}')
+    cfg_parser = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
+    cfg_parser.read(configuration_file)
+    return cfg_parser
+
+
+def _get_configuration_value(environment, section, name, value_type, config_parser, overrides, default=None):
+    """Returns the override for name when one is set, else the value from the file.
+
+    An override is considered set when it is not None. Otherwise the value is
+    read from section as value_type (bool, int, or otherwise a string), using
+    default when it is missing. The environment is supplied to the parser so
+    that ${environment} references can be interpolated.
+    """
+    override = overrides.get(name)
+    if override is not None:
+        return override
+
+    interpolation_vars = {'environment': environment}
+    if value_type is bool:
+        return config_parser.getboolean(section, name, vars=interpolation_vars, fallback=default)
+    if value_type is int:
+        return config_parser.getint(section, name, vars=interpolation_vars, fallback=default)
+    return config_parser.get(section, name, vars=interpolation_vars, fallback=default)
+
+
+def configuration(config_parser, overrides, environment=constants.DEFAULT_CUMULUS_ENVIRONMENT):
+    """Builds a Config from the parsed file, the overrides, and the environment.
+
+    Raises Exception, chained to the original error, if a value cannot be read.
+    """
+    try:
+        return Config(
+            environment,
+            _get_configuration_value(environment, 'Source', 'data_dir', str, config_parser, overrides),
+            _get_configuration_value(environment, 'Collection', 'auth_id', str, config_parser, overrides),
+            _get_configuration_value(environment, 'Collection', 'version', int, config_parser, overrides),
+            _get_configuration_value(environment, 'Collection', 'provider', str, config_parser, overrides),
+            _get_configuration_value(environment, 'Destination', 'local_output_dir', str, config_parser, overrides),
+            _get_configuration_value(environment, 'Destination', 'ummg_dir', str, config_parser, overrides),
+            _get_configuration_value(environment, 'Destination', 'kinesis_stream_name', str, config_parser, overrides),
+            _get_configuration_value(environment, 'Destination', 'write_cnm_file', bool, config_parser, overrides, False),
+            _get_configuration_value(environment, 'Settings', 'checksum_type', str, config_parser, overrides, 'SHA256'),
+            _get_configuration_value(environment, 'Settings', 'number', int, config_parser, overrides, -1),
+        )
+    except Exception as e:
+        # Raise rather than return, so callers never receive an Exception in place of a Config.
+        raise Exception('Unable to read the configuration file', e) from e
+
+
+def validate(configuration):
+    """Validates each value in the configuration.
+
+    Returns a (valid, errors) tuple: valid is True when every check passes, and
+    errors holds the message of each check that failed.
+    """
+    validations = [
+        ['data_dir', os.path.exists, 'The data_dir does not exist.'],
+        ['local_output_dir', os.path.exists, 'The local_output_dir does not exist.'],
+        # TODO: decide what validation (if any) applies to ummg_dir.
+        ['kinesis_stream_name', aws.kinesis_stream_exists, 'The kinesis stream does not exist.'],
+    ]
+    errors = [msg for name, fn, msg in validations if not fn(getattr(configuration, name))]
+    return not errors, errors
+