Commit 1038f9a

Merge branch 'dt-lgb' of github.com:Neo9061/amazon-sagemaker-examples into dt-lgb
2 parents b66ca49 + 27a1f1f commit 1038f9a

File tree: 22 files changed (+2736 −0 lines)

README.md

Lines changed: 1 addition & 0 deletions
@@ -62,6 +62,7 @@ These examples provide a gentle introduction to machine learning concepts as the
62 62   - [Traffic violations forecasting using DeepAR](introduction_to_applying_machine_learning/deepar_chicago_traffic_violations) is an example that uses daily traffic violation data to forecast patterns and seasonality with the Amazon SageMaker DeepAR algorithm.
63 63   - [Visual Inspection Automation with Pre-trained Amazon SageMaker Models](introduction_to_applying_machine_learning/visual_object_detection) is an example of fine-tuning pre-trained Amazon SageMaker models on a target dataset.
64 64   - [Create SageMaker Models Using the PyTorch Model Zoo](introduction_to_applying_machine_learning/sagemaker_pytorch_model_zoo) contains an example notebook to create a SageMaker model leveraging the PyTorch Model Zoo and visualize the results.
   65 + - [Fraud Detection Using Graph Neural Networks](introduction_to_applying_machine_learning/fraud_detection_using_graph_neural_networks) is an example that identifies fraudulent transactions from transaction and user identity datasets.
65 66   - [Identify key insights from textual document](introduction_to_applying_machine_learning/identify_key_insights_from_textual_document) contains comprehensive notebooks for five natural language processing tasks: Document Summarization, Text Classification, Question Answering, Named Entity Recognition, and Semantic Relation Extraction.
66 67   - [Synthetic Churn Prediction with Text](introduction_to_applying_machine_learning/synthetic_churn_prediction_with_text) contains an example notebook to train, deploy, and use a churn prediction model that processes numerical, categorical, and textual features to make its prediction.

introduction_to_applying_machine_learning/README.md

Lines changed: 1 addition & 0 deletions
@@ -17,4 +17,5 @@ These examples provide a gentle introduction to machine learning concepts as the
17 17   - [Traffic violations forecasting using DeepAR](deepar_chicago_traffic_violations) is an example that uses daily traffic violation data to forecast patterns and seasonality with the Amazon SageMaker DeepAR algorithm.
18 18   - [Visual Inspection Automation with Pre-trained Amazon SageMaker Models](visual_object_detection) is an example of fine-tuning pre-trained Amazon SageMaker models on a target dataset.
19 19   - [Create SageMaker Models Using the PyTorch Model Zoo](sagemaker_pytorch_model_zoo) contains an example notebook to create a SageMaker model leveraging the PyTorch Model Zoo and visualize the results.
   20 + - [Fraud Detection Using Graph Neural Networks](fraud_detection_using_graph_neural_networks) is an example that identifies fraudulent transactions from transaction and user identity datasets.
20 21   - [Identify key insights from textual document](identify_key_insights_from_textual_document) contains comprehensive notebooks for five natural language processing tasks: Document Summarization, Text Classification, Question Answering, Named Entity Recognition, and Semantic Relation Extraction.
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
import os
import pandas as pd


def get_data():
    # Load the preprocessed features, labels, and split files written by the data
    # processing step and assemble train/validation/test DataFrames whose first
    # column is the label 'isFraud'.
    data_prefix = "preprocessed-data/"

    if not os.path.exists(data_prefix):
        print("""Expected the following folder {} to contain the preprocessed data.
        Run data processing first in main notebook before running baselines comparisons""".format(data_prefix))
        return

    features = pd.read_csv(data_prefix + "features_xgboost.csv", header=None)
    labels = pd.read_csv(data_prefix + "tags.csv").set_index('TransactionID')
    valid_users = pd.read_csv(data_prefix + "validation.csv", header=None)
    test_users = pd.read_csv(data_prefix + "test.csv", header=None)

    # Column 0 of the headerless CSVs is the TransactionID.
    valid_X = features.merge(valid_users, on=[0], how='inner')
    test_X = features.merge(test_users, on=[0], how='inner')

    # Training transactions are those in neither the validation nor the test split.
    train_index = ~((features[0].isin(test_users[0].values) | (features[0].isin(valid_users[0].values))))
    train_X = features[train_index]
    valid_y = labels.loc[valid_X[0]]
    test_y = labels.loc[test_X[0]]
    train_y = labels.loc[train_X[0]]

    train_X.set_index([0], inplace=True)
    valid_X.set_index([0], inplace=True)
    test_X.set_index([0], inplace=True)

    train_data = train_y.join(train_X)  # first column is the label 'isFraud'
    valid_data = valid_y.join(valid_X)
    test_data = test_y.join(test_X)
    return train_data, valid_data, test_data
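
Because the first column of each returned frame is the 'isFraud' label and the remaining columns are features, the output can feed a baseline classifier directly. A minimal sketch of such a baseline, assuming the preprocessed-data/ folder from the main notebook exists and the xgboost package is installed (the hyperparameters below are illustrative, not taken from this commit):

import xgboost as xgb

train_data, valid_data, test_data = get_data()

# Split each frame into label (first column, 'isFraud') and features (remaining columns).
dtrain = xgb.DMatrix(train_data.iloc[:, 1:], label=train_data['isFraud'])
dvalid = xgb.DMatrix(valid_data.iloc[:, 1:], label=valid_data['isFraud'])

params = {"objective": "binary:logistic", "eval_metric": "auc"}  # illustrative settings
booster = xgb.train(params, dtrain, num_boost_round=100, evals=[(dvalid, "validation")])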
Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
from faker import Faker
import datetime
import itertools
import numpy as np
import pandas as pd

Faker.seed(0)
np.random.seed(0)

NUM_UNIQUE_CCS = 40*10**3
START_TRANS_DATE = datetime.datetime(2012, 1, 15)
END_TRANS_DATE = datetime.datetime(2012, 3, 15)


def gen_fraud_data(num_unique_ccs=NUM_UNIQUE_CCS, start_trans_date=START_TRANS_DATE, end_trans_date=END_TRANS_DATE):
    fake = Faker()
    cc_nums = [fake.credit_card_number() for _ in range(num_unique_ccs)]
    cc_types = [fake.credit_card_provider() for _ in range(num_unique_ccs)]
    num_trans_per_cc = np.ceil(np.random.exponential(scale=3, size=num_unique_ccs)).astype(np.int32)
    cc_ipv4 = [fake.ipv4() for _ in range(num_unique_ccs)]
    cc_phone_number = [fake.phone_number() for _ in range(num_unique_ccs)]
    cc_device_id = [fake.msisdn() for _ in range(num_unique_ccs)]

    # Each card's attributes are repeated across all of that card's transactions.
    data = {
        'TransactionID': [fake.uuid4() for _ in range(sum(num_trans_per_cc))],
        'TransactionDT': [fake.date_time_between_dates(datetime_start=start_trans_date, datetime_end=end_trans_date)
                          for _ in range(sum(num_trans_per_cc))],
        'card_no': list(itertools.chain.from_iterable([[cc_num]*num_trans for cc_num, num_trans in zip(cc_nums, num_trans_per_cc)])),
        'card_type': list(itertools.chain.from_iterable([[card]*num_trans for card, num_trans in zip(cc_types, num_trans_per_cc)])),
        'email_domain': [fake.ascii_email().split("@")[1] for _ in range(sum(num_trans_per_cc))],
        'ProductCD': np.random.choice(['45', 'AB', 'L', 'Y', 'T'], size=sum(num_trans_per_cc)),
        'TransactionAmt': np.abs(np.ceil(np.random.exponential(scale=10, size=sum(num_trans_per_cc))*100)).astype(np.int32),
    }
    transactions = pd.DataFrame(data).sort_values(by=['TransactionDT'])

    # To make the identity table smaller than the transactions table, which may be
    # more realistic in a practical scenario, reduce the size argument below.
    identity_transactions_idx = np.random.choice(transactions.shape[0], size=int(transactions.shape[0]*1.0), replace=False)
    id_data = {
        'IpAddress': list(itertools.chain.from_iterable([[ipv4]*num_trans for ipv4, num_trans in zip(cc_ipv4, num_trans_per_cc)])),
        'PhoneNo': list(itertools.chain.from_iterable([[phone_num]*num_trans for phone_num, num_trans in zip(cc_phone_number, num_trans_per_cc)])),
        'DeviceID': list(itertools.chain.from_iterable([[device_id]*num_trans for device_id, num_trans in zip(cc_device_id, num_trans_per_cc)])),
    }
    identity = pd.DataFrame(id_data)
    identity["TransactionID"] = transactions.TransactionID
    assert identity.shape[0] == transactions.shape[0]

    identity = identity.loc[identity_transactions_idx]
    identity.reset_index(drop=True, inplace=True)
    identity = identity[["TransactionID", "IpAddress", "PhoneNo", "DeviceID"]]

    # join the two tables for the convenience of generating the label column 'isFraud'
    full_two_df = transactions[["TransactionID", "card_no", "card_type", "email_domain", "ProductCD", "TransactionAmt"]].merge(identity, on='TransactionID', how='left')

    # Hand-crafted rules: higher fraud probability for certain email domains, product
    # codes, device/phone/IP suffixes, and card types.
    is_fraud = []
    for idx, row in full_two_df.iterrows():
        card_no, card_type, email, product_type, transaction_amount, ip_address, phone_no, device_id = str(row["card_no"]), row["card_type"], row["email_domain"], row["ProductCD"], row["TransactionAmt"], str(row["IpAddress"]), str(row["PhoneNo"]), str(row["DeviceID"])

        if email in ["hotmail.com", "gmail.com", "yahoo.com"]:
            if product_type in ["45"]:
                is_fraud.append(int(np.random.uniform() < 0.9))
            else:
                if (device_id != "nan") and (device_id.endswith("16") or device_id.endswith("78") or device_id.endswith("23")):
                    is_fraud.append(int(np.random.uniform() < 0.1))
                else:
                    is_fraud.append(int(np.random.uniform() < 0.05))
        else:
            if transaction_amount > 3000:
                is_fraud.append(int(np.random.uniform() < 0.8))
            else:
                if card_type in ["Diners Club / Carte Blanche", "JCB 15 digit", "Maestro"]:  # about 35,000 observations fall in these categories
                    if (card_no.endswith("001") or card_no.endswith("002") or card_no.endswith("003") or card_no.endswith("004") or card_no.endswith("005") or card_no.endswith("007") or card_no.endswith("008") or card_no.endswith("009")) or ((phone_no != "nan") and (phone_no.endswith(".227") or phone_no.endswith(".104") or phone_no.endswith(".251") or phone_no.endswith(".181"))):
                        is_fraud.append(int(np.random.uniform() < 0.3))
                    else:
                        if (ip_address != "nan") and (ip_address.endswith(".227") or ip_address.endswith(".104") or ip_address.endswith(".251") or ip_address.endswith(".181")):
                            is_fraud.append(int(np.random.uniform() < 0.2))
                        else:
                            is_fraud.append(int(np.random.uniform() < 0.1))
                else:
                    is_fraud.append(int(np.random.uniform() < 0.0001))
    print("fraud ratio", sum(is_fraud) / len(is_fraud))

    transactions['isFraud'] = is_fraud
    return transactions, identity


if __name__ == '__main__':
    transaction, identity = gen_fraud_data()
    transaction.to_csv('raw_data/transaction.csv', index=False)
    identity.to_csv('raw_data/identity.csv', index=False)
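
For a quick smoke test of the generator without the full 40k-card run, it can be called with a smaller num_unique_ccs. A minimal sketch (the raw_data/ directory name follows the __main__ block above; pandas' to_csv does not create it, so it is created first):

import os

os.makedirs("raw_data", exist_ok=True)  # to_csv does not create the directory

# Smaller run for a quick check; __main__ above uses the full defaults.
transactions, identity = gen_fraud_data(num_unique_ccs=1000)
print(transactions.shape, identity.shape)
print(transactions["isFraud"].mean())   # overall fraud ratio

transactions.to_csv("raw_data/transaction.csv", index=False)
identity.to_csv("raw_data/identity.csv", index=False)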
@@ -0,0 +1,144 @@
import argparse
import logging
import os

import pandas as pd
import numpy as np
from itertools import combinations


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir', type=str, default='/opt/ml/processing/input')
    parser.add_argument('--output-dir', type=str, default='/opt/ml/processing/output')
    parser.add_argument('--transactions', type=str, default='transaction.csv', help='name of file with transactions')
    parser.add_argument('--identity', type=str, default='identity.csv', help='name of file with identity info')
    parser.add_argument('--id-cols', type=str, default='', help='comma separated id cols in transactions table')
    parser.add_argument('--cat-cols', type=str, default='', help='comma separated categorical cols in transactions')
    parser.add_argument('--cat-cols-xgboost', type=str, default='', help='comma separated categorical cols that can be used as features for xgboost in transactions')
    parser.add_argument('--train-data-ratio', type=float, default=0.7, help='fraction of data to use in training set')
    parser.add_argument('--valid-data-ratio', type=float, default=0.2, help='fraction of data to use in validation set')
    parser.add_argument('--construct-homogeneous', action="store_true", default=False,
                        help='use bipartite graph edgelists to construct a homogeneous graph edgelist')
    return parser.parse_args()


def get_logger(name):
    logger = logging.getLogger(name)
    log_format = '%(asctime)s %(levelname)s %(name)s: %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)
    logger.setLevel(logging.INFO)
    return logger


def load_data(data_dir, transaction_data, identity_data, train_data_ratio, valid_data_ratio, output_dir):
    transaction_df = pd.read_csv(os.path.join(data_dir, transaction_data))
    logging.info("Shape of transaction data is {}".format(transaction_df.shape))
    logging.info("# Tagged transactions: {}".format(len(transaction_df) - transaction_df.isFraud.isnull().sum()))

    identity_df = pd.read_csv(os.path.join(data_dir, identity_data))
    logging.info("Shape of identity data is {}".format(identity_df.shape))

    # extract transactions for the train, validation, and test splits
    logging.info("Training, validation, and test data fractions are {}, {}, and {}, respectively".format(train_data_ratio, valid_data_ratio, 1 - train_data_ratio - valid_data_ratio))
    assert train_data_ratio + valid_data_ratio < 1, "The sum of the training and validation ratios must be less than 1."
    n_train = int(transaction_df.shape[0]*train_data_ratio)
    n_valid = int(transaction_df.shape[0]*(train_data_ratio + valid_data_ratio))
    valid_ids = transaction_df.TransactionID.values[n_train:n_valid]
    test_ids = transaction_df.TransactionID.values[n_valid:]

    get_fraud_frac = lambda series: 100 * sum(series)/len(series)
    logging.info("Percentage of fraud transactions for train data: {}".format(get_fraud_frac(transaction_df.isFraud[:n_train])))
    logging.info("Percentage of fraud transactions for validation data: {}".format(get_fraud_frac(transaction_df.isFraud[n_train:n_valid])))
    logging.info("Percentage of fraud transactions for test data: {}".format(get_fraud_frac(transaction_df.isFraud[n_valid:])))
    logging.info("Percentage of fraud transactions for all data: {}".format(get_fraud_frac(transaction_df.isFraud)))

    with open(os.path.join(output_dir, 'validation.csv'), 'w') as f:
        f.writelines(map(lambda x: str(x) + "\n", valid_ids))
    logging.info("Wrote validation data to file: {}".format(os.path.join(output_dir, 'validation.csv')))

    with open(os.path.join(output_dir, 'test.csv'), 'w') as f:
        f.writelines(map(lambda x: str(x) + "\n", test_ids))
    logging.info("Wrote test data to file: {}".format(os.path.join(output_dir, 'test.csv')))

    return transaction_df, identity_df, valid_ids, test_ids


def get_features_and_labels(transactions_df, transactions_id_cols, transactions_cat_cols, transactions_cat_cols_xgboost, output_dir):
    # Get features
    non_feature_cols = ['isFraud', 'TransactionDT'] + transactions_id_cols.split(",")
    feature_cols = [col for col in transactions_df.columns if col not in non_feature_cols]
    logging.info("Categorical columns: {}".format(transactions_cat_cols.split(",")))
    features = pd.get_dummies(transactions_df[feature_cols], columns=transactions_cat_cols.split(",")).fillna(0)
    features['TransactionAmt'] = features['TransactionAmt'].apply(np.log10)
    logging.info("Transformed feature columns: {}".format(list(features.columns)))
    logging.info("Shape of features: {}".format(features.shape))
    features.to_csv(os.path.join(output_dir, 'features.csv'), index=False, header=False)
    logging.info("Wrote features to file: {}".format(os.path.join(output_dir, 'features.csv')))

    logging.info("Processing feature columns for XGBoost.")
    cat_cols_xgb = transactions_cat_cols_xgboost.split(",")
    logging.info("Categorical feature columns for XGBoost: {}".format(cat_cols_xgb))
    logging.info("Numerical feature column for XGBoost: 'TransactionAmt'")
    features_xgb = pd.get_dummies(transactions_df[['TransactionID'] + cat_cols_xgb], columns=cat_cols_xgb).fillna(0)
    features_xgb['TransactionAmt'] = features['TransactionAmt']
    features_xgb.to_csv(os.path.join(output_dir, 'features_xgboost.csv'), index=False, header=False)
    logging.info("Wrote features to file: {}".format(os.path.join(output_dir, 'features_xgboost.csv')))

    # Get labels
    transactions_df[['TransactionID', 'isFraud']].to_csv(os.path.join(output_dir, 'tags.csv'), index=False)
    logging.info("Wrote labels to file: {}".format(os.path.join(output_dir, 'tags.csv')))


def get_relations_and_edgelist(transactions_df, identity_df, transactions_id_cols, output_dir):
    # Get relations
    edge_types = transactions_id_cols.split(",") + list(identity_df.columns)
    logging.info("Found the following distinct relation types: {}".format(edge_types))
    id_cols = ['TransactionID'] + transactions_id_cols.split(",")
    full_identity_df = transactions_df[id_cols].merge(identity_df, on='TransactionID', how='left')
    logging.info("Shape of identity columns: {}".format(full_identity_df.shape))

    # extract edges: one bipartite edgelist per relation type
    edges = {}
    for etype in edge_types:
        edgelist = full_identity_df[['TransactionID', etype]].dropna()
        edgelist.to_csv(os.path.join(output_dir, 'relation_{}_edgelist.csv').format(etype), index=False, header=True)
        logging.info("Wrote edgelist to: {}".format(os.path.join(output_dir, 'relation_{}_edgelist.csv').format(etype)))
        edges[etype] = edgelist
    return edges


def create_homogeneous_edgelist(edges, output_dir):
    # Connect transactions that share an identity attribute, deduplicating pairs.
    homogeneous_edges = []
    for etype, relations in edges.items():
        for edge_relation, frame in relations.groupby(etype):
            new_edges = [(a, b) for (a, b) in combinations(frame.TransactionID.values, 2)
                         if (a, b) not in homogeneous_edges and (b, a) not in homogeneous_edges]
            homogeneous_edges.extend(new_edges)

    with open(os.path.join(output_dir, 'homogeneous_edgelist.csv'), 'w') as f:
        f.writelines(map(lambda x: "{}, {}\n".format(x[0], x[1]), homogeneous_edges))
    logging.info("Wrote homogeneous edgelist to file: {}".format(os.path.join(output_dir, 'homogeneous_edgelist.csv')))


if __name__ == '__main__':
    logging = get_logger(__name__)

    args = parse_args()

    transactions, identity, _, _ = load_data(args.data_dir,
                                             args.transactions,
                                             args.identity,
                                             args.train_data_ratio,
                                             args.valid_data_ratio,
                                             args.output_dir)

    get_features_and_labels(transactions, args.id_cols, args.cat_cols, args.cat_cols_xgboost, args.output_dir)
    relational_edges = get_relations_and_edgelist(transactions, identity, args.id_cols, args.output_dir)

    if args.construct_homogeneous:
        create_homogeneous_edgelist(relational_edges, args.output_dir)
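
To sanity-check the processing script end to end outside of a SageMaker Processing container, its functions can be driven locally against the synthetic CSVs written by the generator above. This is only a sketch: the module name fraud_data_processing and the id/categorical column choices below are illustrative assumptions, not values taken from this commit.

import os
# Hypothetical module name for the processing script shown above.
from fraud_data_processing import load_data, get_features_and_labels, get_relations_and_edgelist

os.makedirs("preprocessed-data", exist_ok=True)

# Illustrative column choices: identity-like columns become graph relations,
# and a small set of categorical columns is one-hot encoded.
id_cols = "card_no,card_type,email_domain"
cat_cols = "ProductCD"
cat_cols_xgb = "card_type,ProductCD"

transactions, identity, _, _ = load_data("raw_data", "transaction.csv", "identity.csv",
                                         0.7, 0.2, "preprocessed-data")
get_features_and_labels(transactions, id_cols, cat_cols, cat_cols_xgb, "preprocessed-data")
edges = get_relations_and_edgelist(transactions, identity, id_cols, "preprocessed-data")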
