From 7de54dad6b72fbe3b2ef55017223d8fd8762d8c7 Mon Sep 17 00:00:00 2001
From: meng teng
Date: Tue, 9 Oct 2018 00:13:48 -0400
Subject: [PATCH 1/2] Preprocessing of geoNetwork.

---
 dataset.py | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/dataset.py b/dataset.py
index a9243c0..0b507e2 100644
--- a/dataset.py
+++ b/dataset.py
@@ -5,6 +5,7 @@
 import numpy as np
 import json
 import argparse
+from sklearn import preprocessing
 
 _DATA_DIR = './data'
 _TRAIN = 'train.csv'
@@ -85,6 +86,7 @@ def preprocess(self, do_val_split=False):
 
         # Preprocessing operations go here.
         df['log_sum_revenue'] = self._make_log_sum_revenue()
+        df['geoNetwork_networkDomain'] = self._convert_geoNetwork_domain()
 
         return df
 
@@ -107,15 +109,38 @@ def _make_log_sum_revenue(self):
 
         train_revenue_log_sum = (train_revenue_sum + 1).apply(np.log)
         return train_revenue_log_sum
-
-
+
 
     def _make_json_converter(self, column_name):
         """Helper function to interpret columns in PANDAS."""
         return lambda x: {column_name: json.loads(x)}
 
-
-
+    def _convert_geoNetwork_domain(self):
+        """One hot encode domain, location, region, subContinent in geoNetwork.
+        Missing value automatically imputed by one hot encoder.
+        Standardize using normal distribution.
+        Group by fullVisitorId.
+
+        Returns:
+            A DataFrame containing preprocessed geoNetwork Data with one hot encoding.
+        """
+        train_df = self.train.copy(deep=False)
+        to_encode = ['networkDomain', 'networkLocation', 'region', 'subContinent']
+        results = pd.DataFrame()
+        for index, row in train_df.iterrows():
+            for item in to_encode:
+                individual_key = 'geoNetwork.' + item
+                row[individual_key] = row['geoNetwork']['geoNetwork'][item]
+        for item in to_encode:
+            individual_key = 'geoNetwork.' + item
+            encoded = pd.get_dummies(train_df[individual_key], prefix=individual_key)
+            results = pd.concat([results, encoded], axis=1)
+        columns = results.columns
+        results[columns] = preprocessing.scale(results[columns])
+        results[columns] = preprocessing.normalize(results[columns], norm='l2')
+
+        return results
+
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
@@ -125,6 +150,7 @@ def _make_json_converter(self, column_name):
     args = parser.parse_args()
 
     # Make sure we can load the dataset
+    args.debug = True
     dataset = Dataset(debug=args.debug)
 
     # Sanity check, make sure we have the right number of rows
@@ -138,3 +164,4 @@ def _make_json_converter(self, column_name):
     assert num_test == _NUM_ROWS_TEST, 'Incorrect number of test examples found.'
 
     print('Successfully loaded the dataset.')
+    dataset.preprocess()
From f7444929cfbde15c7d0beeff20f143a9f164e074 Mon Sep 17 00:00:00 2001
From: meng teng
Date: Thu, 11 Oct 2018 15:24:36 -0400
Subject: [PATCH 2/2] Revise geoNetwork based on comments.

---
 dataset.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/dataset.py b/dataset.py
index 0b507e2..95b745b 100644
--- a/dataset.py
+++ b/dataset.py
@@ -86,7 +86,9 @@ def preprocess(self, do_val_split=False):
 
         # Preprocessing operations go here.
         df['log_sum_revenue'] = self._make_log_sum_revenue()
-        df['geoNetwork_networkDomain'] = self._convert_geoNetwork_domain()
+        tmp_geoNetwork = self._convert_geoNetwork_domain()
+        geoNetwork_columns = tmp_geoNetwork.columns
+        df[geoNetwork_columns] = tmp_geoNetwork[geoNetwork_columns]
 
         return df
 
@@ -125,8 +127,9 @@ def _convert_geoNetwork_domain(self):
             A DataFrame containing preprocessed geoNetwork Data with one hot encoding.
         """
         train_df = self.train.copy(deep=False)
+        train_df.set_index('fullVisitorId', inplace=True)
         to_encode = ['networkDomain', 'networkLocation', 'region', 'subContinent']
-        results = pd.DataFrame()
+        results = pd.DataFrame(index=train_df.index.copy())
         for index, row in train_df.iterrows():
             for item in to_encode:
                 individual_key = 'geoNetwork.' + item
@@ -136,9 +139,10 @@ def _convert_geoNetwork_domain(self):
             encoded = pd.get_dummies(train_df[individual_key], prefix=individual_key)
             results = pd.concat([results, encoded], axis=1)
         columns = results.columns
-        results[columns] = preprocessing.scale(results[columns])
-        results[columns] = preprocessing.normalize(results[columns], norm='l2')
-
+        scaler = preprocessing.StandardScaler()
+        results[columns] = scaler.fit_transform(results[columns])
+        results = results.groupby(results.index).agg('mean')
+        results[columns] = preprocessing.normalize(results[columns].values.astype(float), norm='l2')
         return results
 
 
@@ -150,7 +154,6 @@ def _convert_geoNetwork_domain(self):
     args = parser.parse_args()
 
     # Make sure we can load the dataset
-    args.debug = True
     dataset = Dataset(debug=args.debug)
 
     # Sanity check, make sure we have the right number of rows
@@ -164,4 +167,3 @@ def _convert_geoNetwork_domain(self):
     assert num_test == _NUM_ROWS_TEST, 'Incorrect number of test examples found.'
 
     print('Successfully loaded the dataset.')
-    dataset.preprocess()