From d96c3030cabe613db6b3aede20679f373b046a6d Mon Sep 17 00:00:00 2001 From: olalekanelesin Date: Thu, 11 Jan 2018 17:43:30 +0100 Subject: [PATCH 1/5] Replaced specifying options to yaml config file --- utilities/Hive_metastore_migration/README.md | 35 +++++++---------- .../artifacts/config.yaml | 8 ++++ .../src/hive_metastore_migration.py | 39 ++++++++++++++++++- 3 files changed, 59 insertions(+), 23 deletions(-) create mode 100644 utilities/Hive_metastore_migration/artifacts/config.yaml diff --git a/utilities/Hive_metastore_migration/README.md b/utilities/Hive_metastore_migration/README.md index a20ffe53..999afc08 100644 --- a/utilities/Hive_metastore_migration/README.md +++ b/utilities/Hive_metastore_migration/README.md @@ -186,35 +186,26 @@ as an Glue ETL job, if AWS Glue can directly connect to your Hive metastore. 2. Submit the `hive_metastore_migration.py` Spark script to your Spark cluster using the following parameters: - - Set `--direction` to `from_metastore`, or omit the argument since - `from_metastore` is the default. + - Set `--config_file` to `` (default path: `artifacts/config.yaml`) + + - Provide the following configuration parameters in the configuration yaml file: + ``` + * mode + * jdbc-url + * jdbc-username + * jdbc-password + * database-prefix + * table-prefix + ``` - - Provide the JDBC connection information through these arguments: - `--jdbc-url`, `--jdbc-username`, and `--jdbc-password`. - - - The argument `--output-path` is required. It is either a local file system location - or an S3 location. If the output path is a local directory, you can upload the data - to an S3 location manually. If it is an S3 path, you need to make sure that the Spark - cluster has EMRFS library in its class path. The script will export the metadata to a - subdirectory of the output-path you provided. - - - `--database-prefix` and `--table-prefix` (optional) to set a string prefix that is applied to the - database and table names. They are empty by default. 
- - Example spark-submit command to migrate Hive metastore to S3, tested on EMR-4.7.1: - ```bash + ```bash MYSQL_JAR_PATH=/usr/lib/hadoop/mysql-connector-java-5.1.42-bin.jar DRIVER_CLASSPATH=/home/hadoop/*:/etc/hadoop/conf:/etc/hive/conf:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:$MYSQL_JAR_PATH spark-submit --driver-class-path $DRIVER_CLASSPATH \ --jars $MYSQL_JAR_PATH \ /home/hadoop/hive_metastore_migration.py \ - --mode from-metastore \ - --jdbc-url jdbc:mysql://metastore.foo.us-east-1.rds.amazonaws.com:3306 \ - --jdbc-user hive \ - --jdbc-password myJDBCPassword \ - --database-prefix myHiveMetastore_ \ - --table-prefix myHiveMetastore_ \ - --output-path s3://mybucket/myfolder/ + --config_file artifacts/config.yaml ``` - If the job finishes successfully, it creates 3 sub-folders in the S3 output path you diff --git a/utilities/Hive_metastore_migration/artifacts/config.yaml b/utilities/Hive_metastore_migration/artifacts/config.yaml new file mode 100644 index 00000000..81a82093 --- /dev/null +++ b/utilities/Hive_metastore_migration/artifacts/config.yaml @@ -0,0 +1,8 @@ +mode: +jdbc-url: +jdbc-username: +jdbc-password: +database-prefix: +table-prefix: +output-path: +input_path: \ No newline at end of file diff --git a/utilities/Hive_metastore_migration/src/hive_metastore_migration.py b/utilities/Hive_metastore_migration/src/hive_metastore_migration.py index 16562256..b6447f1d 100644 --- a/utilities/Hive_metastore_migration/src/hive_metastore_migration.py +++ b/utilities/Hive_metastore_migration/src/hive_metastore_migration.py @@ -14,6 +14,7 @@ # except for python 2.7 standard library and Spark 2.1 import sys import argparse +import yaml import re import logging from time import localtime, strftime @@ -1398,6 +1399,39 @@ def parse_arguments(args): return options +def parse_arguments_from_yaml_file(args): + """ + This function 
accepts the path to a config file + and extracts the needed arguments for the metastore migration + ---------- + Return: + Dictionary of config options + """ + parser = argparse.ArgumentParser(prog=args[0]) + parser.add_argument('-f', '--config_file', required=True, default='artifacts/config.yaml`', help='Provide yaml configuration file path to read migration arguments from. Default path: `artifacts/config.yaml`') + options = get_options(parser, args) + config_file_path = options['config_file'] + ## read the yaml file + with open(config_file_path, 'r') as yaml_file_stream: + config_options = yaml.load(yaml_file_stream) + + if config_options['mode'] == FROM_METASTORE: + validate_options_in_mode( + options=config_options, mode=FROM_METASTORE, + required_options=['output_path'], + not_allowed_options=['input_path'] + ) + elif config_options['mode'] == TO_METASTORE: + validate_options_in_mode( + options=config_options, mode=TO_METASTORE, + required_options=['input_path'], + not_allowed_options=['output_path'] + ) + else: + raise AssertionError('unknown mode ' + options['mode']) + + return config_options + def get_spark_env(): conf = SparkConf() sc = SparkContext(conf=conf) @@ -1501,7 +1535,10 @@ def validate_aws_regions(region): def main(): - options = parse_arguments(sys.argv) + # options = parse_arguments(sys.argv) + + ## This now reads options from path to config yaml file + options = parse_arguments_from_yaml_file(sys.argv) connection = { 'url': options['jdbc_url'], From 4156b728423c59f4beacd44a8c8f3db4e0484b18 Mon Sep 17 00:00:00 2001 From: Ryo Manabe Date: Tue, 27 May 2025 17:15:41 +0900 Subject: [PATCH 2/5] * Made YAML file configuration optional for backward compatibility * Resolved the warning message with yaml.load() : https://github.com/yaml/pyyaml/wiki/PyYAML-yaml.load(input)-Deprecation --- utilities/Hive_metastore_migration/README.md | 43 ++++++++--- .../artifacts/config.yaml | 8 -- .../src/hive_metastore_migration.py | 75 ++++++++----------- 3 files 
changed, 61 insertions(+), 65 deletions(-) delete mode 100644 utilities/Hive_metastore_migration/artifacts/config.yaml diff --git a/utilities/Hive_metastore_migration/README.md b/utilities/Hive_metastore_migration/README.md index 80ec95d9..ae543d90 100644 --- a/utilities/Hive_metastore_migration/README.md +++ b/utilities/Hive_metastore_migration/README.md @@ -191,17 +191,30 @@ as an Glue ETL job, if AWS Glue can directly connect to your Hive metastore. 2. Submit the `hive_metastore_migration.py` Spark script to your Spark cluster using the following parameters: - - Set `--config_file` to `` (default path: `artifacts/config.yaml`) - - - Provide the following configuration parameters in the configuration yaml file: - ``` - * mode - * jdbc-url - * jdbc-username - * jdbc-password - * database-prefix - * table-prefix - ``` + - Set `--direction` to `from_metastore`, or omit the argument since + `from_metastore` is the default. + + - Provide the JDBC connection information through these arguments: + `--jdbc-url`, `--jdbc-username`, and `--jdbc-password`. + + - The argument `--output-path` is required. It is either a local file system location + or an S3 location. If the output path is a local directory, you can upload the data + to an S3 location manually. If it is an S3 path, you need to make sure that the Spark + cluster has EMRFS library in its class path. The script will export the metadata to a + subdirectory of the output-path you provided. + + - `--database-prefix` and `--table-prefix` (optional) to set a string prefix that is applied to the + database and table names. They are empty by default. + + - Optionally, you can set `--config_file` to `` which contains configuration parameters. 
+ - Provide the following configuration parameters in the configuration yaml file: + * mode: + * jdbc_url: + * jdbc_username: + * jdbc_password: + * database_prefix: + * table_prefix: + * output_path: - Example spark-submit command to migrate Hive metastore to S3, tested on EMR-4.7.1: ```bash @@ -210,7 +223,13 @@ as an Glue ETL job, if AWS Glue can directly connect to your Hive metastore. spark-submit --driver-class-path $DRIVER_CLASSPATH \ --jars $MYSQL_JAR_PATH \ /home/hadoop/hive_metastore_migration.py \ - --config_file artifacts/config.yaml + --mode from-metastore \ + --jdbc-url jdbc:mysql://metastore.foo.us-east-1.rds.amazonaws.com:3306 \ + --jdbc-user hive \ + --jdbc-password myJDBCPassword \ + --database-prefix myHiveMetastore_ \ + --table-prefix myHiveMetastore_ \ + --output-path s3://mybucket/myfolder/ ``` - If the job finishes successfully, it creates 3 sub-folders in the S3 output path you diff --git a/utilities/Hive_metastore_migration/artifacts/config.yaml b/utilities/Hive_metastore_migration/artifacts/config.yaml deleted file mode 100644 index 81a82093..00000000 --- a/utilities/Hive_metastore_migration/artifacts/config.yaml +++ /dev/null @@ -1,8 +0,0 @@ -mode: -jdbc-url: -jdbc-username: -jdbc-password: -database-prefix: -table-prefix: -output-path: -input_path: \ No newline at end of file diff --git a/utilities/Hive_metastore_migration/src/hive_metastore_migration.py b/utilities/Hive_metastore_migration/src/hive_metastore_migration.py index 2a8dfabc..3115a6ed 100644 --- a/utilities/Hive_metastore_migration/src/hive_metastore_migration.py +++ b/utilities/Hive_metastore_migration/src/hive_metastore_migration.py @@ -1581,25 +1581,46 @@ def get_options(parser, args): def parse_arguments(args): + """ + parse arguments for the metastore migration. + If a yaml file is provided, it will override any parameters specified on the command line. 
+ ---------- + Return: + Dictionary of config options + """ parser = argparse.ArgumentParser(prog=args[0]) - parser.add_argument("-m", "--mode", required=True, choices=[FROM_METASTORE, TO_METASTORE], help="Choose to migrate metastore either from JDBC or from S3") - parser.add_argument("-U", "--jdbc-url", required=True, help="Hive metastore JDBC url, example: jdbc:mysql://metastore.abcd.us-east-1.rds.amazonaws.com:3306") - parser.add_argument("-u", "--jdbc-username", required=True, help="Hive metastore JDBC user name") - parser.add_argument("-p", "--jdbc-password", required=True, help="Hive metastore JDBC password") + parser.add_argument("-m", "--mode", required=False, choices=[FROM_METASTORE, TO_METASTORE], help="Choose to migrate metastore either from JDBC or from S3") + parser.add_argument("-U", "--jdbc-url", required=False, help="Hive metastore JDBC url, example: jdbc:mysql://metastore.abcd.us-east-1.rds.amazonaws.com:3306") + parser.add_argument("-u", "--jdbc-username", required=False, help="Hive metastore JDBC user name") + parser.add_argument("-p", "--jdbc-password", required=False, help="Hive metastore JDBC password") parser.add_argument("-d", "--database-prefix", required=False, help="Optional prefix for database names in Glue DataCatalog") parser.add_argument("-t", "--table-prefix", required=False, help="Optional prefix for table name in Glue DataCatalog") parser.add_argument("-o", "--output-path", required=False, help="Output path, either local directory or S3 path") parser.add_argument("-i", "--input_path", required=False, help="Input path, either local directory or S3 path") - + parser.add_argument("-f", "--config_file", required=False, help="yaml configuration file path to read migration arguments from.") options = get_options(parser, args) - if options["mode"] == FROM_METASTORE: + if options.get("config_file") is not None: + # parse yaml config file if provided + config_file_path = options["config_file"] + logger.info(f"config_file provided. 
Parsing arguments from {options["config_file"]}") + with open(config_file_path, 'r') as yaml_file_stream: + config_options = yaml.load(yaml_file_stream, Loader=yaml.FullLoader) + options = {**options, **config_options} + + if options.get("mode") is None: + raise AssertionError("--mode options is required: either from_metastore or to_metastore") + elif options["mode"] == FROM_METASTORE: validate_options_in_mode( - options=options, mode=FROM_METASTORE, required_options=["output_path"], not_allowed_options=["input_path"] + options=options, mode=FROM_METASTORE, + required_options=["jdbc_url", "jdbc_username", "jdbc_password", "output_path"], + not_allowed_options=["input_path"] ) elif options["mode"] == TO_METASTORE: validate_options_in_mode( - options=options, mode=TO_METASTORE, required_options=["input_path"], not_allowed_options=["output_path"] + options=options, mode=TO_METASTORE, + required_options=["jdbc_url", "jdbc_username", "jdbc_password", "input_path"], + not_allowed_options=["output_path"] ) else: raise AssertionError("unknown mode " + options["mode"]) @@ -1607,39 +1628,6 @@ def parse_arguments(args): return options -def parse_arguments_from_yaml_file(args): - """ - This function accepts the path to a config file - and extracts the needed arguments for the metastore migration - ---------- - Return: - Dictionary of config options - """ - parser = argparse.ArgumentParser(prog=args[0]) - parser.add_argument('-f', '--config_file', required=True, default='artifacts/config.yaml`', help='Provide yaml configuration file path to read migration arguments from. 
Default path: `artifacts/config.yaml`') - options = get_options(parser, args) - config_file_path = options['config_file'] - ## read the yaml file - with open(config_file_path, 'r') as yaml_file_stream: - config_options = yaml.load(yaml_file_stream) - - if config_options['mode'] == FROM_METASTORE: - validate_options_in_mode( - options=config_options, mode=FROM_METASTORE, - required_options=['output_path'], - not_allowed_options=['input_path'] - ) - elif config_options['mode'] == TO_METASTORE: - validate_options_in_mode( - options=config_options, mode=TO_METASTORE, - required_options=['input_path'], - not_allowed_options=['output_path'] - ) - else: - raise AssertionError('unknown mode ' + options['mode']) - - return config_options - def get_spark_env(): try: sc = SparkContext.getOrCreate() @@ -1767,10 +1755,7 @@ def validate_aws_regions(region): def main(): - # options = parse_arguments(sys.argv) - - ## This now reads options from path to config yaml file - options = parse_arguments_from_yaml_file(sys.argv) + options = parse_arguments(sys.argv) connection = {"url": options["jdbc_url"], "user": options["jdbc_username"], "password": options["jdbc_password"]} db_prefix = options.get("database_prefix") or "" From 85a98b603733dc4f91bad31d9e8de7efca56d01a Mon Sep 17 00:00:00 2001 From: Ryo Manabe Date: Wed, 28 May 2025 14:58:36 +0900 Subject: [PATCH 3/5] Changed the config file format to JSON so that hive_metastore_migration.py can be loaded from Glue 5.0 job, which doesn't have PyYAML by default --- utilities/Hive_metastore_migration/README.md | 37 +++++++++++++------ .../src/hive_metastore_migration.py | 13 +++---- 2 files changed, 32 insertions(+), 18 deletions(-) diff --git a/utilities/Hive_metastore_migration/README.md b/utilities/Hive_metastore_migration/README.md index ae543d90..25b20828 100644 --- a/utilities/Hive_metastore_migration/README.md +++ b/utilities/Hive_metastore_migration/README.md @@ -206,18 +206,22 @@ as an Glue ETL job, if AWS Glue can directly 
connect to your Hive metastore. - `--database-prefix` and `--table-prefix` (optional) to set a string prefix that is applied to the database and table names. They are empty by default. - - Optionally, you can set `--config_file` to `` which contains configuration parameters. - - Provide the following configuration parameters in the configuration yaml file: - * mode: - * jdbc_url: - * jdbc_username: - * jdbc_password: - * database_prefix: - * table_prefix: - * output_path: + - Optionally, you can set `--config_file` to `` which contains the configuration parameters. + - Provide the following configuration parameters in the configuration json file: + ```json + { + "mode": "from-metastore", + "jdbc_url": "JDBC URL", + "jdbc_username": "JDBC username", + "jdbc_password": "JDBC password", + "database_prefix": "Database prefix", + "table_prefix": "Table prefix", + "output_path": "Output local or s3 path" + } + ``` - Example spark-submit command to migrate Hive metastore to S3, tested on EMR-4.7.1: - ```bash + ```bash MYSQL_JAR_PATH=/usr/lib/hadoop/mysql-connector-java-5.1.42-bin.jar DRIVER_CLASSPATH=/home/hadoop/*:/etc/hadoop/conf:/etc/hive/conf:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:$MYSQL_JAR_PATH spark-submit --driver-class-path $DRIVER_CLASSPATH \ @@ -370,7 +374,7 @@ as an Glue ETL job, if AWS Glue can directly connect to your Hive metastore. 3. Submit the `hive_metastore_migration.py` Spark script to your Spark cluster. - - Set `--direction` to `to_metastore`. + - Set `--mode` to `to_metastore`. - Provide the JDBC connection information through the arguments: `--jdbc-url`, `--jdbc-username`, and `--jdbc-password`. - The argument `--input-path` is required. This can be a local directory or @@ -392,6 +396,17 @@ as an Glue ETL job, if AWS Glue can directly connect to your Hive metastore. 
s3://gluemigrationbucket/export_output// + - Optionally, you can set `--config_file` to `` which contains the configuration parameters. + - Provide the following configuration parameters in the configuration json file: + ```json + { + "mode": "to-metastore", + "jdbc_url": "JDBC URL", + "jdbc_username": "JDBC username", + "jdbc_password": "JDBC password", + "input_path": "Input local or S3 path" + } + ``` #### AWS Glue Data Catalog to another AWS Glue Data Catalog diff --git a/utilities/Hive_metastore_migration/src/hive_metastore_migration.py b/utilities/Hive_metastore_migration/src/hive_metastore_migration.py index 3115a6ed..cc36e800 100644 --- a/utilities/Hive_metastore_migration/src/hive_metastore_migration.py +++ b/utilities/Hive_metastore_migration/src/hive_metastore_migration.py @@ -9,7 +9,6 @@ # except for python 2.7 standard library and Spark 2.1 import sys from datetime import datetime, timedelta, tzinfo -import yaml from time import localtime, strftime from types import MethodType @@ -1583,7 +1582,7 @@ def get_options(parser, args): def parse_arguments(args): """ parse arguments for the metastore migration. - If a yaml file is provided, it will override any parameters specified on the command line. + If a json file is provided, it will override any parameters specified on the command line. 
---------- Return: Dictionary of config options @@ -1597,15 +1596,15 @@ def parse_arguments(args): parser.add_argument("-t", "--table-prefix", required=False, help="Optional prefix for table name in Glue DataCatalog") parser.add_argument("-o", "--output-path", required=False, help="Output path, either local directory or S3 path") parser.add_argument("-i", "--input_path", required=False, help="Input path, either local directory or S3 path") - parser.add_argument("-f", "--config_file", required=False, help="yaml configuration file path to read migration arguments from.") + parser.add_argument("-f", "--config_file", required=False, help="json configuration file path to read migration arguments from.") options = get_options(parser, args) if options.get("config_file") is not None: - # parse yaml config file if provided + # parse json config file if provided config_file_path = options["config_file"] - logger.info(f"config_file provided. Parsing arguments from {options["config_file"]}") - with open(config_file_path, 'r') as yaml_file_stream: - config_options = yaml.load(yaml_file_stream, Loader=yaml.FullLoader) + logger.info(f"config_file provided. 
Parsing arguments from {config_file_path}") + with open(config_file_path, 'r') as json_file_stream: + config_options = json.load(json_file_stream) options = {**options, **config_options} if options.get("mode") is None: From 6de768119ed5a640cdaa1dc0d8af488922d10b53 Mon Sep 17 00:00:00 2001 From: Ryo Manabe Date: Wed, 28 May 2025 18:05:15 +0900 Subject: [PATCH 4/5] Prioritize command line parameters over json config file for hive metastore migration --- utilities/Hive_metastore_migration/README.md | 2 +- .../src/hive_metastore_migration.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/utilities/Hive_metastore_migration/README.md b/utilities/Hive_metastore_migration/README.md index 25b20828..8f75eddd 100644 --- a/utilities/Hive_metastore_migration/README.md +++ b/utilities/Hive_metastore_migration/README.md @@ -404,7 +404,7 @@ as an Glue ETL job, if AWS Glue can directly connect to your Hive metastore. "jdbc_url": "JDBC URL", "jdbc_username": "JDBC username", "jdbc_password": "JDBC password", - "input_path": "Input local or S3 path" + "input_path": "Input local or S3 path" } ``` diff --git a/utilities/Hive_metastore_migration/src/hive_metastore_migration.py b/utilities/Hive_metastore_migration/src/hive_metastore_migration.py index cc36e800..366cf58c 100644 --- a/utilities/Hive_metastore_migration/src/hive_metastore_migration.py +++ b/utilities/Hive_metastore_migration/src/hive_metastore_migration.py @@ -1582,7 +1582,7 @@ def get_options(parser, args): def parse_arguments(args): """ parse arguments for the metastore migration. - If a json file is provided, it will override any parameters specified on the command line. + If arguments are provided by both a json config file and command line, command line arguments will override any parameters specified on the json file. ---------- Return: Dictionary of config options @@ -1605,7 +1605,13 @@ def parse_arguments(args): logger.info(f"config_file provided. 
Parsing arguments from {config_file_path}")
         with open(config_file_path, 'r') as json_file_stream:
             config_options = json.load(json_file_stream)
-        options = {**options, **config_options}
+
+        # merge options. command line options are prioritized.
+        for key in config_options:
+            if not options.get(key):
+                options[key] = config_options[key]
+            elif options[key] is None:
+                options[key] = config_options[key]
 
     if options.get("mode") is None:
         raise AssertionError("--mode options is required: either from_metastore or to_metastore")

From c9c73be4f37244dfb973cfb2c2d59567c35139f9 Mon Sep 17 00:00:00 2001
From: Ryo Manabe
Date: Mon, 2 Jun 2025 09:48:05 +0900
Subject: [PATCH 5/5] Add explanation of what happens when the same parameters
 are specified both in the config file and command line

---
 utilities/Hive_metastore_migration/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utilities/Hive_metastore_migration/README.md b/utilities/Hive_metastore_migration/README.md
index 8f75eddd..c4a502df 100644
--- a/utilities/Hive_metastore_migration/README.md
+++ b/utilities/Hive_metastore_migration/README.md
@@ -206,7 +206,7 @@ as an Glue ETL job, if AWS Glue can directly connect to your Hive metastore.
   - `--database-prefix` and `--table-prefix` (optional) to set a string prefix that is applied to the
     database and table names. They are empty by default.
 
-  - Optionally, you can set `--config_file` to `` which contains the configuration parameters.
+  - Optionally, you can set `--config_file` to `` which contains the configuration parameters. If the same parameters are specified in both the configuration json file and the command line, the parameters specified on the command line will be used.
     - Provide the following configuration parameters in the configuration json file:
       ```json
       {
@@ -396,7 +396,7 @@ as an Glue ETL job, if AWS Glue can directly connect to your Hive metastore.
s3://gluemigrationbucket/export_output//
 
-  - Optionally, you can set `--config_file` to `` which contains the configuration parameters.
+  - Optionally, you can set `--config_file` to `` which contains the configuration parameters. If the same parameters are specified in both the configuration json file and the command line, the parameters specified on the command line will be used.
   - Provide the following configuration parameters in the configuration json file:
     ```json
     {