liupeirong · h2floh · Nov 20, 2020 · Nov 20, 2020 · Nov 23, 2020 · Dec 3, 2020
diff --git a/.env.example b/.env.example
@@ -14,14 +14,28 @@ SUBSCRIPTION_ID = ''
 LOCATION = ''
 BASE_NAME = ''
 RESOURCE_GROUP = ''
+
+# Observability related
+APPLICATIONINSIGHTS_CONNECTION_STRING = ''
+LOG_TO_CONSOLE = 'false'
+# Allowed values: DEBUG, INFO, WARNING, ERROR, CRITICAL
+LOG_LEVEL = 'DEBUG' 
+# Sampling probability between 0.0 and 1.0
+LOG_SAMPLING_RATE = '1.0'
+# Sampling probability between 0.0 and 1.0
+TRACE_SAMPLING_RATE = '1.0'
+# Export interval in seconds
+METRICS_EXPORT_INTERVAL = '15'
+
+# Azure ML Workspace Variables
 WORKSPACE_NAME = ''
 ACI_DEPLOYMENT_NAME = ''
 
 ####################################################
 # Variables that are defined in variables-template.yml
 #   they determine _how_ the project runs 
 ####################################################
-SOURCES_DIR_TRAIN = 'ml_model'
+SOURCES_DIR_TRAIN = '.'
 EXPERIMENT_NAME = 'flower_classification'
 DATASET_NAME = 'flower_dataset'
 # Optional. Set it if you have configured non default datastore to point to your data

diff --git a/.pipelines/02-processing-data.yml b/.pipelines/02-processing-data.yml
@@ -51,6 +51,8 @@ stages:
           # Invoke the Python building and publishing a data preprocessing pipeline
           python -m ml_service.pipelines.build_data_processing_pipeline
       displayName: 'Publish Data Preprocessing Pipeline'
+      env:
+        APPLICATIONINSIGHTS_CONNECTION_STRING: $(APPLICATIONINSIGHTS_CONNECTION_STRING)
 
 # Trigger_Preprocessing_Pipeline
 - template: trigger-preprocessing-pipeline.yml

diff --git a/.pipelines/03-train-evaluate-register-model.yml b/.pipelines/03-train-evaluate-register-model.yml
@@ -62,6 +62,8 @@ stages:
           # Invoke the Python building and publishing a training pipeline
           python -m ml_service.pipelines.build_training_pipeline
       displayName: 'Publish Azure Machine Learning Pipeline'
+      env:
+        APPLICATIONINSIGHTS_CONNECTION_STRING: $(APPLICATIONINSIGHTS_CONNECTION_STRING)
 
 - stage: 'Trigger_Training_Pipeline'
   displayName: 'Train and evaluate model'
@@ -75,6 +77,7 @@ stages:
     container: mlops
     timeoutInMinutes: 0
     steps:
+    - template: update-ci-dependencies.yml
     - task: AzureCLI@1
       inputs:
         azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
@@ -89,6 +92,9 @@ stages:
           echo "##vso[task.setvariable variable=AMLPIPELINEID;isOutput=true]$AMLPIPELINEID"
       name: 'getpipelineid'
       displayName: 'Get Pipeline ID'
+      env:
+        APPLICATIONINSIGHTS_CONNECTION_STRING: $(APPLICATIONINSIGHTS_CONNECTION_STRING)
+
   - job: "Run_ML_Pipeline"
     dependsOn: "Get_Pipeline_ID"
     displayName: "Trigger ML Training Pipeline"

diff --git a/.pipelines/04-deploy-model-aci.yml b/.pipelines/04-deploy-model-aci.yml
@@ -67,3 +67,5 @@ stages:
           set -e # fail on error
           export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
           python -m ml_service.util.smoke_test_scoring_service --service "$(ACI_DEPLOYMENT_NAME)"
+      env:
+        APPLICATIONINSIGHTS_CONNECTION_STRING: $(APPLICATIONINSIGHTS_CONNECTION_STRING)
diff --git a/.pipelines/07-processing-data-os-cmd.yml b/.pipelines/07-processing-data-os-cmd.yml
@@ -51,6 +51,8 @@ stages:
           # Invoke the Python building and publishing a data preprocessing pipeline
           python -m ml_service.pipelines.build_data_processing_os_cmd_pipeline
       displayName: 'Publish Data Preprocessing OS cmd Pipeline'
+      env:
+        APPLICATIONINSIGHTS_CONNECTION_STRING: $(APPLICATIONINSIGHTS_CONNECTION_STRING)
 
 # Trigger_Preprocessing_Pipeline
 - template: trigger-preprocessing-pipeline.yml

diff --git a/.pipelines/code-quality-template.yml b/.pipelines/code-quality-template.yml
@@ -1,5 +1,7 @@
 # Pipeline template to run linting, unit tests with code coverage, and publish the results.
 steps:
+- template: update-ci-dependencies.yml
+
 - script: |   
    flake8 --output-file=lint-testresults.xml --format junit-xml
   displayName: 'Run lint tests'

diff --git a/.pipelines/trigger-preprocessing-pipeline.yml b/.pipelines/trigger-preprocessing-pipeline.yml
@@ -16,6 +16,7 @@ stages:
     container: mlops
     timeoutInMinutes: 0
     steps:
+    - template: update-ci-dependencies.yml
     - task: AzureCLI@1
       inputs:
         azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
@@ -30,6 +31,9 @@ stages:
           echo "##vso[task.setvariable variable=PREPROCESSPIPELINEID;isOutput=true]$PREPROCESSPIPELINEID"
       name: 'getpreprocessingpipelineid'
       displayName: 'Get Preprocessing Pipeline ID of ${{ parameters.aml_pipeline_name }}'
+      env:
+        APPLICATIONINSIGHTS_CONNECTION_STRING: $(APPLICATIONINSIGHTS_CONNECTION_STRING)
+
   - job: "Run_Data_Processing_Pipeline"
     dependsOn: "Get_Preprocessing_Pipeline_ID"
     displayName: "Trigger Preprocessing Pipeline ${{ parameters.aml_pipeline_name }}"

diff --git a/.pipelines/update-ci-dependencies.yml b/.pipelines/update-ci-dependencies.yml
@@ -0,0 +1,5 @@
+steps:
+# Update the `ci` conda environment on the build agent from this branch's ml_model/ci_dependencies.yml
+- script: |
+    conda env update -f ml_model/ci_dependencies.yml -n ci
+  displayName: 'Update missing dependencies for current branch on build agent'
diff --git a/.pipelines/variables-template.yml b/.pipelines/variables-template.yml
@@ -4,7 +4,7 @@
 variables:
   # The directory containing the scripts for training, evaluating, and registering the model
   - name: SOURCES_DIR_TRAIN
-    value: ml_model
+    value: '.'
 
   # Azure ML Variables
   - name: EXPERIMENT_NAME
@@ -32,8 +32,8 @@ variables:
   - name: ALLOW_RUN_CANCEL
     value: "false"
   # Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yaml.
-  # - name: AML_REBUILD_ENVIRONMENT
-  #  value: "false"
+  - name: AML_REBUILD_ENVIRONMENT
+    value: "true"
 
   # AML Environment Config
   - name: AML_ENV_NAME
@@ -42,6 +42,8 @@ variables:
     value: flower_custom_preprocess_env
+  - name: AML_ENV_TRAIN_CONDA_DEP_FILE
+    value: "ml_model/conda_dependencies.yml"
 
   # AML Compute Cluster Config
   - name: AML_COMPUTE_CLUSTER_CPU_SKU
     value: STANDARD_DS2_V2
   - name: AML_COMPUTE_CLUSTER_NAME
@@ -52,6 +54,24 @@ variables:
     value: 0
   - name: AML_CLUSTER_MAX_NODES
     value: 4
+  - name: AML_CLUSTER_PRIORITY
+    value: lowpriority
+
+  # Observability related
+  - name: LOG_TO_CONSOLE
+    value: 'false'
+  - name: LOG_LEVEL
+    value: 'INFO' # DEBUG, INFO, WARNING, ERROR, CRITICAL
+  - name: LOG_SAMPLING_RATE
+    value: '1.0' # Probability 0.0 -> 1.0
+  - name: TRACE_SAMPLING_RATE
+    value: '1.0' # Probability 0.0 -> 1.0
+  - name: METRICS_EXPORT_INTERVAL
+    value: '15' # Seconds
+
+  # The name for the (docker/webapp) scoring image
+  - name: IMAGE_NAME
+    value: "flowerclassifier"
 
   # AML pipelines can run outside of Azure DevOps, these parameters control AML pipeline behaviors
   - name: PREPROCESSING_PARAM

diff --git a/ml_model/ci_dependencies.yml b/ml_model/ci_dependencies.yml
@@ -26,4 +26,11 @@ dependencies:
       - tensorflow==2.3.*
       - keras==2.4.*
 
+      # Observability
+      - dataclasses==0.6
+      - opencensus==0.7.11
+      - opencensus-ext-httplib==0.7.3
+      - opencensus-ext-logging==0.1.0
+      - opencensus-context==0.1.2
+      - opencensus-ext-azure==1.0.5
 
diff --git a/ml_model/conda_dependencies.yml b/ml_model/conda_dependencies.yml
@@ -23,3 +23,12 @@ dependencies:
       # Training deps
       - tensorflow==2.3.*
       - keras==2.4.*
+
+      # Observability
+      - python-dotenv==0.12.*
+      - dataclasses==0.6
+      - opencensus==0.7.11
+      - opencensus-ext-httplib==0.7.3
+      - opencensus-ext-logging==0.1.0
+      - opencensus-context==0.1.2
+      - opencensus-ext-azure==1.0.5
diff --git a/ml_model/dev_dependencies.yml b/ml_model/dev_dependencies.yml
@@ -29,4 +29,10 @@ dependencies:
       - keras==2.4.*
       - debugpy
 
-
+      # Observability
+      - dataclasses==0.6
+      - opencensus==0.7.11
+      - opencensus-ext-httplib==0.7.3
+      - opencensus-ext-logging==0.1.0
+      - opencensus-context==0.1.2
+      - opencensus-ext-azure==1.0.5
diff --git a/ml_model/evaluate/evaluate_model.py b/ml_model/evaluate/evaluate_model.py
@@ -25,7 +25,8 @@
 """
 from azureml.core import Run
 import argparse
-from util.model_helper import get_model
+from ml_model.util.model_helper import get_model
+from ml_service.util.logger.observability import observability
 
 
 def evaluate_model_performs_better(model, run):
@@ -37,13 +38,14 @@ def evaluate_model_performs_better(model, run):
     if (production_model_accuracy is None or new_model_accuracy is None):
         raise Exception(f"Unable to find {metric_eval} metrics, exiting evaluation")  # NOQA: E501
     else:
-        print(f"Current model accuracy: {production_model_accuracy}, new model accuracy: {new_model_accuracy}")  # NOQA: E501
+        observability.log(f"Current model accuracy: {production_model_accuracy}, new model accuracy: {new_model_accuracy}")  # NOQA: E501
 
     if (new_model_accuracy > production_model_accuracy):
-        print("New model performs better, register it")
+        observability.log("New model performs better, register it")
         return True
     else:
-        print("New model doesn't perform better, skip registration")
+        observability.log("New model doesn't perform better,"
+                          " skip registration")
         return False
 
 
@@ -91,8 +93,15 @@ def main():
         if(not should_register and (allow_run_cancel).lower() == 'true'):
             run.parent.cancel()
     else:
-        print("This is the first model, register it")
+        observability.log("This is the first model, register it")
 
 
 if __name__ == '__main__':
-    main()
+    observability.start_span('evaluate_model')
+    try:
+        main()
+    except Exception as exception:
+        observability.exception(exception)
+        raise  # bare re-raise: propagate the caught exception unchanged
+    finally:
+        observability.end_span()  # always close the span, even on failure
diff --git a/ml_model/preprocessing/Dockerfile b/ml_model/preprocessing/Dockerfile
@@ -25,8 +25,18 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}
     ~/miniconda/bin/conda clean -tipsy
 ENV PATH="/home/dockeruser/miniconda/bin/:${PATH}"
 
+USER root
+
+RUN apt-get update --fix-missing && \
+    apt-get install -y build-essential && rm -rf /var/lib/apt/lists/*
+
+# Switch back to dockeruser before installing the conda and pip packages
+USER dockeruser
+
 RUN conda install -y conda=${CONDA_VERSION} python=${PYTHON_VERSION} && \
     pip install azureml-defaults==${AZUREML_SDK_VERSION} inference-schema==${INFERENCE_SCHEMA_VERSION} &&\
+    pip install python-dotenv==0.12.* dataclasses==0.6 opencensus==0.7.11 opencensus-ext-httplib==0.7.3 \
+    opencensus-ext-azure==1.0.5 opencensus-ext-logging==0.1.0 opencensus-context==0.1.2 && \
     conda clean -aqy && \
     rm -rf ~/miniconda/pkgs && \
     find ~/miniconda/ -type d -name __pycache__ -prune -exec rm -rf {} \;
diff --git a/ml_model/preprocessing/preprocess_aml.py b/ml_model/preprocessing/preprocess_aml.py
@@ -26,12 +26,13 @@
 from azureml.core.run import Run
 import argparse
 import json
-from preprocess_images import resize_images
-from util.model_helper import get_or_register_dataset, get_aml_context
+from ml_model.preprocessing.preprocess_images import resize_images
+from ml_model.util.model_helper import get_or_register_dataset, get_aml_context
+from ml_service.util.logger.observability import observability
 
 
 def main():
-    print("Running preprocess.py")
+    observability.log("Running preprocess.py")
 
     parser = argparse.ArgumentParser("preprocess")
     parser.add_argument(
@@ -69,11 +70,12 @@ def main():
 
     args = parser.parse_args()
 
-    print("Argument [dataset_name]: %s" % args.dataset_name)
-    print("Argument [datastore_name]: %s" % args.datastore_name)
-    print("Argument [data_file_path]: %s" % args.data_file_path)
-    print("Argument [output_dataset]: %s" % args.output_dataset)
-    print("Argument [preprocessing_param]: %s" % args.preprocessing_param)
+    observability.log("Argument [dataset_name]: %s" % args.dataset_name)
+    observability.log("Argument [datastore_name]: %s" % args.datastore_name)
+    observability.log("Argument [data_file_path]: %s" % args.data_file_path)
+    observability.log("Argument [output_dataset]: %s" % args.output_dataset)
+    observability.log("Argument [preprocessing_param]: %s"
+                      % args.preprocessing_param)
 
     data_file_path = args.data_file_path
     dataset_name = args.dataset_name
@@ -85,12 +87,12 @@ def main():
     aml_workspace, *_ = get_aml_context(run)
 
     if preprocessing_param is None or preprocessing_param == "":
-        with open("parameters.json") as f:
+        with open("ml_model/parameters.json") as f:
             pars = json.load(f)
             preprocessing_args = pars["preprocessing"]
     else:
         preprocessing_args = json.loads(preprocessing_param)
-    print(f"preprocessing parameters {preprocessing_args}")
+    observability.log(f"preprocessing parameters {preprocessing_args}")
     for (k, v) in preprocessing_args.items():
         run.log(k, v)
         run.parent.log(k, v)
@@ -107,15 +109,22 @@ def main():
     # Process data
     mount_context = dataset.mount()
     mount_context.start()
-    print(f"mount_point is: {mount_context.mount_point}")
+    observability.log(f"mount_point is: {mount_context.mount_point}")
     resize_images(mount_context.mount_point, output_dataset, preprocessing_args)  # NOQA: E501
     mount_context.stop()
 
     run.tag("run_type", value="preprocess")
-    print(f"tags now present for run: {run.tags}")
+    observability.log(f"tags now present for run: {run.tags}")
 
     run.complete()
 
 
 if __name__ == '__main__':
-    main()
+    observability.start_span('preprocess_aml')
+    try:
+        main()
+    except Exception as exception:
+        observability.exception(exception)
+        raise  # bare re-raise: propagate the caught exception unchanged
+    finally:
+        observability.end_span()  # always close the span, even on failure