In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [2]:
# Input CSV locations: relative paths, resolved against the current working directory.
train_path, test_path = 'train.csv', 'test.csv'
def load_data(path, drop_cols=None):
    """Read a CSV file into a DataFrame, optionally discarding columns.

    Parameters
    ----------
    path
        Location of the CSV file; its first row is used as the header.
    drop_cols
        Column label(s) to remove after loading. Any falsy value
        (``None``, an empty list, ...) leaves the frame untouched.

    Returns
    -------
    pandas.DataFrame
        The loaded data, without the columns named in ``drop_cols``.
    """
    # Decide up front whether anything has to be removed, then read once.
    wants_drop = bool(drop_cols)
    frame = pd.read_csv(path, header=0)
    return frame.drop(drop_cols, axis=1) if wants_drop else frame

# Load the training data; the 'ID' column is removed right after reading.
train = load_data(path=train_path, drop_cols=['ID'])
In [3]:
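# Helper used to keep the same columns across all data sets.
# pd.get_dummies only creates columns for the categories it actually sees,
# so different subsets of the data can end up with different columns;
# we want the column sets to be uniform.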
def get_cols(X):
    """Return the column labels obtained by one-hot encoding ``X``.

    ``X`` is passed to ``pd.get_dummies``; the labels of the resulting
    frame are returned as a plain Python list, in the frame's column order.
    """
    encoded = pd.get_dummies(X)
    return [label for label in encoded.columns]
In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test partition; the fixed random_state
# makes the split reproducible across runs.
train_set, test_set = train_test_split(train, test_size=0.2, random_state=42)

# Separate the target column "y" from the remaining feature columns.
# Double-bracket selection keeps each target as a single-column DataFrame.
X_train, y_train = train_set.drop(columns=["y"]), train_set[["y"]]
X_test, y_test = test_set.drop(columns=["y"]), test_set[["y"]]
# Keep only the dummy-encoded columns that occur in BOTH partitions: a
# category level that appears in just one split produces a column the
# other split lacks.
train_cols = get_cols(X_train)
test_cols = get_cols(X_test)
# Membership is checked against a set (constant-time lookup) instead of
# scanning the test_cols list for every training column. Iterating over
# train_cols keeps the result in the training frame's column order.
test_col_set = set(test_cols)
columns = [col for col in train_cols if col in test_col_set]