import random
from datetime import datetime, timedelta
-
- import awswrangler as wr
- import boto3
- import botocore
- from decimal import Decimal
import numpy as np
- import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType, TimestampType, BooleanType
+ import boto3
+ import awswrangler as wr
+ import os

# Initialize Spark session
spark = SparkSession.builder.appName("FraudClassificationSchema").getOrCreate()

- try:
-     dynamodb = boto3.client('dynamodb')
- except botocore.exceptions.NoCredentialsError:
-     print("AWS credentials not found. Please configure your AWS credentials.")
-     raise
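+ # Point boto3 and awswrangler at the same DynamoDB endpoint; DYNAMO_ENDPOINT
+ # overrides the default, which targets a local instance (e.g. DynamoDB Local).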
+ ENDPOINT_URL = os.environ.get("DYNAMO_ENDPOINT", "http://localhost:8000")
+
+ wr.config.dynamodb_endpoint_url = ENDPOINT_URL
+ dynamodb = boto3.client('dynamodb', endpoint_url=ENDPOINT_URL)
+

def time_to_value(t, base_value, amplitude, noise_level, scale=1):
    if scale is None:
@@ -106,18 +103,18 @@ def generate_timeseries_with_anomalies(num_samples=1000, base_value=100, amplitu

    # User features (dim_user) – 7
    StructField("user_account_age", IntegerType(), True),
-     StructField("user_account_balance", DoubleType(), True),
-     StructField("user_credit_score", IntegerType(), True),
-     StructField("user_number_of_devices", IntegerType(), True),
+     StructField("account_balance", DoubleType(), True),
+     StructField("credit_score", IntegerType(), True),
+     StructField("number_of_devices", IntegerType(), True),
    StructField("user_country", StringType(), True),
    StructField("user_account_type", IntegerType(), True),
    StructField("user_preferred_language", StringType(), True),

    # Merchant features (dim_merchant) – 6
    StructField("merchant_account_age", IntegerType(), True),
-     StructField("merchant_zipcode", IntegerType(), True),
+     StructField("zipcode", IntegerType(), True),
    # Set to True for 100 of the merchant_ids
-     StructField("merchant_is_big_merchant", BooleanType(), True),
+     StructField("is_big_merchant", BooleanType(), True),
    StructField("merchant_country", StringType(), True),
    StructField("merchant_account_type", IntegerType(), True),
    StructField("merchant_preferred_language", StringType(), True),
@@ -260,80 +257,8 @@ def generate_fraud_sample_data(num_samples=10000):
        data.append(tuple(row))
    return data

- fraud_data = generate_fraud_sample_data(5000)
+ fraud_data = generate_fraud_sample_data(20000)
fraud_df = spark.createDataFrame(fraud_data, schema=fraud_schema)

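+ # Persist the generated dataset as parquet files under the local "data" directory.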
fraud_df.write.mode("overwrite").parquet("data")
- print("Successfully wrote user data to parquet", flush=True)
-
-
- dynamodb = boto3.client('dynamodb')
-
- skip_data_upload = False
- table_name = "transaction_risk"
- try:
-     dynamodb.create_table(
-         AttributeDefinitions=[
-             {
-                 'AttributeName': 'user_id',
-                 'AttributeType': 'N',
-             },
-             {
-                 'AttributeName': 'ts',
-                 'AttributeType': 'N',
-             },
-         ],
-         KeySchema=[
-             {
-                 'AttributeName': 'user_id',
-                 'KeyType': 'HASH'
-             },
-             {
-                 'AttributeName': 'ts',
-                 'KeyType': 'RANGE',
-             },
-         ],
-         ProvisionedThroughput={
-             'ReadCapacityUnits': 5,
-             'WriteCapacityUnits': 5,
-         },
-         TableName=table_name,
-     )
-     dynamodb.get_waiter('table_exists').wait(TableName=table_name)
- except botocore.exceptions.ClientError as error:
-     if error.response['Error']['Code'] == 'ResourceInUseException':
-         # Table already exists
-         print("Dynamo Table already exists. Skipping upload")
-         skip_data_upload = True
-     else:
-         raise error
-
- if not skip_data_upload:
-     print("Done creating table. Beginning data upload (This may take a few minutes)", flush=True)
-
-     fraud_df = fraud_df.toPandas()
-
-     # Convert Float types to Decimal
-     for column in fraud_df.columns:
-         if str(fraud_df.dtypes[column]) == "float64":
-             fraud_df[column] = fraud_df[column].apply(str).apply(Decimal)
-
-     # Convert datetime to string and int
-     fraud_df['ds'] = pd.to_datetime(fraud_df['transaction_time']).dt.date.apply(str)
-     fraud_df['ts'] = pd.to_datetime(fraud_df['transaction_time']).astype('int64')
-     fraud_df.drop('transaction_time', axis=1, inplace=True)
-
-     print("Uploading to DynamoDB")
-
-     # Upload data in batches
-     batch_size = 1000  # Adjust based on your needs
-     for i in range(0, len(fraud_df), batch_size):
-         batch = fraud_df.iloc[i:i + batch_size]
-         wr.dynamodb.put_df(df=batch, table_name=table_name)
-         print(f"Uploaded batch {i // batch_size + 1}/{len(fraud_df) // batch_size + 1}", flush=True)
-
-     # wr.dynamodb.put_df(df=fraud_df, table_name=table_name)
-
- print("Wrote parquet to Dynamo")
+ print("Successfully wrote user data to parquet")
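+
+ # Optional sanity check: reload the parquet output and confirm the row count
+ # matches the generated sample size.
+ # check_df = spark.read.parquet("data")
+ # assert check_df.count() == 20000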