Add GNN encoder and xgb outputs for finance end-to-end example (#2970)

* Readme notebook polish and cleanup * Reorganize folder structure and initial gnn * Complete the graph generate step with edgemap output * Format fix * Format fix * Add graph construction and training notebooks * Add full gnn functionality * Update wording for readme --------- Co-authored-by: Chester Chen <[email protected]>
NVIDIA · Oct 9, 2024 · 595acb2 · 595acb2
1 parent d2d1f96
commit 595acb2
Show file tree

Hide file tree

Showing 30 changed files with 2,103 additions and 1,961 deletions.
diff --git a/examples/advanced/finance-end-to-end/README.md b/examples/advanced/finance-end-to-end/README.md
diff --git a/examples/advanced/finance-end-to-end/feature_enrichment.ipynb b/examples/advanced/finance-end-to-end/feature_enrichment.ipynb
diff --git a/examples/advanced/finance-end-to-end/figures/edge_map.png b/examples/advanced/finance-end-to-end/figures/edge_map.png
diff --git a/examples/advanced/finance-end-to-end/figures/embeddings.png b/examples/advanced/finance-end-to-end/figures/embeddings.png
diff --git a/examples/advanced/finance-end-to-end/figures/enrichment.png b/examples/advanced/finance-end-to-end/figures/enrichment.png
diff --git a/examples/advanced/finance-end-to-end/figures/generated_data.png b/examples/advanced/finance-end-to-end/figures/generated_data.png
diff --git a/examples/advanced/finance-end-to-end/figures/split_data.png b/examples/advanced/finance-end-to-end/figures/split_data.png
diff --git a/examples/advanced/finance-end-to-end/images/enrichment.png b/examples/advanced/finance-end-to-end/images/enrichment.png
diff --git a/examples/advanced/finance-end-to-end/images/generated_data.png b/examples/advanced/finance-end-to-end/images/generated_data.png
diff --git a/examples/advanced/finance-end-to-end/images/split_data.png b/examples/advanced/finance-end-to-end/images/split_data.png
diff --git a/examples/advanced/finance-end-to-end/notebooks/feature_enrichment.ipynb b/examples/advanced/finance-end-to-end/notebooks/feature_enrichment.ipynb
@@ -0,0 +1,319 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "e6d10159-9c02-4bdd-ad6a-f9b7e19ac575",
+   "metadata": {},
+   "source": [
+    "## Feature Enrichment\n",
+    "\n",
+    "### Historical data enrichment\n",
+    "\n",
+    "Pick one client (Site, aka sender_BIC) to do the enrichment as every site will be the same process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7130bd7a-bda0-4592-818f-bd65c505baa3",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "site_input_dir = \"/tmp/dataset/horizontal_credit_fraud_data/\"\n",
+    "site_name = \"ZHSZUS33_Bank_1\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9375ffaa-1143-43f5-b1a3-3ef45918e4bf",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import random\n",
+    "import string\n",
+    "\n",
+    "import pandas as pd\n",
+    "history_file_name = os.path.join(site_input_dir, site_name,\"history.csv\" )\n",
+    "df_history = pd.read_csv(history_file_name)\n",
+    "df_history"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fe8e513-f041-4165-88b1-3b21607ca734",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "history_summary = df_history.groupby('Currency').agg(\n",
+    "                     hist_trans_volume=('UETR', 'count'),\n",
+    "                     hist_total_amount=('Amount', 'sum'),\n",
+    "                     hist_average_amount=('Amount', 'mean')\n",
+    ").reset_index()\n",
+    "\n",
+    "history_summary"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "025ac920-c1c3-401f-b420-18c39b7d04d2",
+   "metadata": {},
+   "source": [
+    "# Enrich Feature with Currency"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7aa07b6d-dc96-45e6-a467-8c770cafb84e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "dataset_names = [\"train\", \"test\"]\n",
+    "results = {}\n",
+    "\n",
+    "temp_ds_df = {}\n",
+    "temp_resampled_df = {}\n",
+    "\n",
+    "\n",
+    "for ds_name in dataset_names:\n",
+    "    file_name = os.path.join(site_input_dir, site_name , f\"{ds_name}.csv\" )\n",
+    "    ds_df  = pd.read_csv(file_name)\n",
+    "    ds_df['Time'] = pd.to_datetime(ds_df['Time'], unit='s')\n",
+    "\n",
+    "    # Set the Time column as the index\n",
+    "    ds_df.set_index('Time', inplace=True)\n",
+    "    \n",
+    "    resampled_df = ds_df.resample('1H').agg(\n",
+    "                     trans_volume=('UETR', 'count'),\n",
+    "                     total_amount=('Amount', 'sum'),\n",
+    "                     average_amount=('Amount', 'mean')\n",
+    "                     ).reset_index()\n",
+    "    \n",
+    "    temp_ds_df[ds_name] = ds_df\n",
+    "    temp_resampled_df[ds_name] = resampled_df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2e86bc5-e8ad-41f5-b343-29595a378c03",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "for ds_name in dataset_names:\n",
+    "        \n",
+    "    ds_df = temp_ds_df[ds_name]\n",
+    "    resampled_df = temp_resampled_df[ds_name]\n",
+    "    \n",
+    "    c_df = ds_df[['Currency']].resample('1H').agg({'Currency': 'first'}).reset_index()\n",
+    "    # Add Currency_Country to the resampled data by joining with the original DataFrame\n",
+    "    resampled_df2 = pd.merge(resampled_df, \n",
+    "                            c_df,\n",
+    "                            on='Time'\n",
+    "                            )\n",
+    "    resampled_df3 = pd.merge(resampled_df2, \n",
+    "                             history_summary,\n",
+    "                             on='Currency'\n",
+    "                            )\n",
+    "    resampled_df4 = resampled_df3.copy()\n",
+    "    resampled_df4['x2_y1'] = resampled_df4['average_amount']/resampled_df4['hist_trans_volume']\n",
+    "    \n",
+    "    ds_df = ds_df.sort_values('Time')\n",
+    "    resampled_df4 = resampled_df4.sort_values('Time')\n",
+    "    \n",
+    "    merged_df = pd.merge_asof(ds_df, resampled_df4, on='Time' )\n",
+    "    merged_df = merged_df.drop(columns=['Currency_y']).rename(columns={'Currency_x': 'Currency'})\n",
+    "    \n",
+    "    results[ds_name] = merged_df\n",
+    "    \n",
+    "print(results)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7051468f-2de0-4e41-a227-7fad4c9110af",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Enrich feature for beneficiary country"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "605095b7-a514-4346-b984-3590d79d13e4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "history_summary2 = df_history.groupby('Beneficiary_BIC').agg(\n",
+    "                     hist_trans_volume=('UETR', 'count'),\n",
+    "                     hist_total_amount=('Amount', 'sum'),\n",
+    "                     hist_average_amount=('Amount', 'mean')\n",
+    ").reset_index()\n",
+    "\n",
+    "history_summary2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "edabd7be-4864-4964-9e25-df543d5985c6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "dataset_names = [\"train\", \"test\"]\n",
+    "results2 = {}\n",
+    "for ds_name in dataset_names:\n",
+    "    ds_df = temp_ds_df[ds_name]\n",
+    "    resampled_df = temp_resampled_df[ds_name]\n",
+    "    \n",
+    "    c_df = ds_df[['Beneficiary_BIC']].resample('1H').agg({'Beneficiary_BIC': 'first'}).reset_index()\n",
+    "    \n",
+    "    # Add Beneficiary_BIC to the resampled data by joining with the original DataFrame\n",
+    "    resampled_df2 = pd.merge(resampled_df, \n",
+    "                            c_df,\n",
+    "                            on='Time'\n",
+    "                            )\n",
+    "    \n",
+    "    resampled_df3 = pd.merge(resampled_df2, \n",
+    "                             history_summary2,\n",
+    "                             on='Beneficiary_BIC'\n",
+    "                            )\n",
+    "    \n",
+    "    \n",
+    "    resampled_df4 = resampled_df3.copy()\n",
+    "    resampled_df4['x3_y2'] = resampled_df4['average_amount']/resampled_df4['hist_trans_volume']\n",
+    "   \n",
+    "    ds_df = ds_df.sort_values('Time')\n",
+    "    resampled_df4 = resampled_df4.sort_values('Time')\n",
+    "\n",
+    "    merged_df2 = pd.merge_asof(ds_df, resampled_df4, on='Time' )\n",
+    "    merged_df2 = merged_df2.drop(columns=['Beneficiary_BIC_y']).rename(columns={'Beneficiary_BIC_x': 'Beneficiary_BIC'})\n",
+    "    \n",
+    "    results2[ds_name] = merged_df2\n",
+    "\n",
+    "print(results2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a44309a2-e252-458d-a9dc-2691aea9360f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "final_results = {}\n",
+    "for name in results:\n",
+    "    df = results[name]\n",
+    "    df2 = results2[name]\n",
+    "    df3 = df2[[\"Time\", \"Beneficiary_BIC\", \"x3_y2\"]].copy()\n",
+    "    df4 = pd.merge(df, df3, on=['Time', 'Beneficiary_BIC'])\n",
+    "    final_results[name] = df4\n",
+    "\n",
+    "    \n",
+    "for name in final_results:\n",
+    "    site_dir = os.path.join(site_input_dir, site_name)\n",
+    "    os.makedirs(site_dir, exist_ok=True)\n",
+    "    enrich_file_name = os.path.join(site_dir, f\"{name}_enrichment.csv\")\n",
+    "    print(enrich_file_name)\n",
+    "    final_results[name].to_csv(enrich_file_name) \n",
+    "    \n",
+    "final_results[\"train\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "47c958c3-bf73-4ab3-a66f-414be10870ea",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "! tree {site_input_dir}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "791ba1db-0ccf-4b31-b838-828d8c6a98a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ls -al  /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eae3d95a-180a-4fb6-b006-1fc1c144c5c4",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "! find /tmp/dataset/horizontal_credit_fraud_data/ZHSZUS33_Bank_1/ -exec wc -l {} \\;"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f9966065-80cb-4f85-adab-8c44f01fc8d1",
+   "metadata": {},
+   "source": [
+    "Let's go back to the [XGBoost Notebook](../xgboost.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d8855463-ce23-44e5-b0ad-4e05d256ba8d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "nvflare_example",
+   "language": "python",
+   "name": "nvflare_example"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}