Titanic: Machine Learning from Disaster
Predict survival on the Titanic with Machine Learning
- Library
- Load the data
- Visualize the datasets
- Encode Embarked as integer categories
- Encode Sex as integers
- Create the category isAlone
- Create the column FamilySize
- Create the title feature
- Imputation
- Split the test and train data
- Using logistic regression
- Decision Tree
- Random Forest Classifier
- KNeighborsClassifier
- MLPClassifier
- The best model: Random Forest
- Hyperparameter Tuning
- Random Forest
- Export the result
This was the first machine learning problem I solved. For more information about this competition on Kaggle, visit: https://www.kaggle.com/c/titanic.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
import seaborn as sns
df_test = pd.read_csv('C:/Users/lucas/Documents/lucastiagooliveira/Kaggle/Titanic - Machine Learning disaster/Datasets/test.csv')
df_train = pd.read_csv('C:/Users/lucas/Documents/lucastiagooliveira/Kaggle/Titanic - Machine Learning disaster/Datasets/train.csv')
df_test_old = df_test.copy()
df_train_old = df_train.copy()
combine = [df_train, df_test]
df_train.info()   # .head().info() would only describe the first 5 rows
print('_'*40)
df_test.info()
df_train[['Sex', 'Survived']].groupby(by = 'Sex').mean()
- Females were more likely to survive
df_train[['Embarked', 'Survived']].groupby(by = 'Embarked').mean()
grid = sns.FacetGrid(df_train, row = 'Embarked', col = 'Survived')
grid.map(plt.hist, 'Age', alpha = .4)
grid = sns.FacetGrid(df_train, row = 'Embarked', col = 'Survived')
grid.map(sns.barplot, 'Sex', 'Fare', alpha = .4)
- Both the embarkation port and sex should be used as model features
df_train[['Pclass','Survived']].groupby(by = 'Pclass').mean()
sns.barplot(x = 'Pclass', y = 'Survived', data = df_train)
sns.barplot(x = 'Pclass', y = 'Fare', data = df_train)
df_train[['SibSp', 'Survived']].groupby(by = 'SibSp').mean()
df_train[['Parch', 'Survived']].groupby(by = 'Parch').mean()
- Passengers traveling alone were less likely to survive
- Passengers with more than 4 parents/children aboard were unlikely to survive
df_train.loc[(df_train['Parch'] == 0) & (df_train['SibSp'] == 0)]
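To quantify the first observation, a minimal check (a sketch using only columns already in the dataframe):

# compare survival rates for solo passengers vs. passengers with family aboard
solo = (df_train['SibSp'] == 0) & (df_train['Parch'] == 0)
print('Alone:      ', round(df_train.loc[solo, 'Survived'].mean(), 3))
print('With family:', round(df_train.loc[~solo, 'Survived'].mean(), 3))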
for data in combine:
    data['Embarked'] = data['Embarked'].map({'S': 0, 'Q': 1, 'C': 2})
df_train.head()
for data in combine:
    data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
df_train.head()
This feature indicates whether the passenger was traveling alone (no siblings/spouse and no parents/children aboard).
for data in combine:
    # 1 when the passenger has no siblings/spouse and no parents/children aboard;
    # the parentheses are required because & binds tighter than ==
    data['isAlone'] = ((data['SibSp'] == 0) & (data['Parch'] == 0)).astype(int)
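A quick sanity check that the new flag behaves as expected (a sketch, in the same style as the earlier groupby checks):

df_train[['isAlone', 'Survived']].groupby(by = 'isAlone').mean()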
df_train.corr()
for data in combine:
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
df_train[['FamilySize','Survived']].groupby(by = 'FamilySize').mean().sort_values(by = 'Survived', ascending = False)
titles = {'Mr':1, 'Miss':2, 'Mrs':3, 'Master': 4, 'Rare': 5}
for data in combine:
    data['Title'] = data.Name.str.extract(r'([A-Za-z]+)\.', expand = False)
    data['Title'] = data['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    data['Title'] = data['Title'].replace(['Mlle', 'Ms'], 'Miss')
    data['Title'] = data['Title'].replace('Mme', 'Mrs')
    data['Title'] = data['Title'].map(titles)
    data['Title'] = data['Title'].fillna(0)
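To verify the encoding, the survival rate per encoded title can be inspected the same way as the other features (sketch):

df_train[['Title', 'Survived']].groupby(by = 'Title').mean()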
for data in combine:
    # inplace = True already modifies the dataframes held in combine;
    # assigning the return value (None) back to data would discard the reference
    data.drop(columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Cabin', 'Ticket'], inplace = True)
df_train.corr()
df_train.isnull().sum()
df_test.isnull().sum()
X = np.asarray(df_train.drop(columns = ['Survived']))
y = np.asarray(df_train[['Survived']])
y = np.ravel(y)
x_test = np.asarray(df_test)
# X = preprocessing.StandardScaler().fit_transform(X)
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors = 3)
X = imputer.fit_transform(X)
imputer = KNNImputer(n_neighbors = 5)
x_test = imputer.fit_transform(x_test)
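Note that the test features are imputed with a separately fitted imputer (and a different n_neighbors) here. A variant that keeps train and test consistent would fit a single imputer on the training features and reuse it; a minimal sketch:

imputer = KNNImputer(n_neighbors = 3)
X = imputer.fit_transform(X)        # learn the imputation from the training data
x_test = imputer.transform(x_test)  # apply the same imputation to the test data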
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)  # fixed seed so the split is reproducible
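About 38% of the training passengers survived; passing stratify = y preserves that ratio in both splits, which makes the validation scores a bit more stable. A sketch of this alternative (not what was used above):

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)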
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
LR = LogisticRegression(C = 0.1, solver = 'newton-cg').fit(X_train, y_train)  # newton-cg is deterministic, so random_state is not needed
LR
yhat_log = LR.predict(X_test)
print(classification_report(y_test, yhat_log))  # argument order is (y_true, y_pred)
log_score = LR.score(X_train, y_train)
from sklearn.tree import DecisionTreeClassifier
ctf = DecisionTreeClassifier(random_state = 42, max_depth = 10, criterion = 'gini').fit(X_train, y_train)
ctf
yhat_ctf = ctf.predict(X_test)
print(classification_report(y_test, yhat_ctf))
tree_score = ctf.score(X_train,y_train)
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 1000).fit(X_train, y_train)
yhat_rfc = rfc.predict(X_test)
print(classification_report(y_test, yhat_rfc))
rfc_score = rfc.score(X_train, y_train)
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(n_neighbors = 5).fit(X_train, y_train)
yhat_knc = knc.predict(X_test)
print(classification_report(y_test, yhat_knc))
knc_score = knc.score(X_train, y_train)
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter = 1000, tol = 1e-10, learning_rate = 'adaptive', alpha = 1e-5).fit(X_train, y_train)
yhat_mlp = mlp.predict(X_test)
print(classification_report(y_test, yhat_mlp))
mlp_score = mlp.score(X_train, y_train)
scores = pd.DataFrame({'Model': ['Logistic regression', 'Decision Tree', 'Random Forest', 'KNeighbors', 'MLPClassifier'],
                       'Score': [log_score, tree_score, rfc_score, knc_score, mlp_score]
                      }).set_index('Model').sort_values(by = 'Score', ascending = False)
scores
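These are training-set accuracies, so flexible models such as the deep tree and the forest look better than they generalize. The same fitted models can also be compared on the held-out split (a sketch):

# evaluate every fitted model on the 30% held-out split
for name, model in [('Logistic regression', LR), ('Decision Tree', ctf),
                    ('Random Forest', rfc), ('KNeighbors', knc), ('MLPClassifier', mlp)]:
    print(name, round(model.score(X_test, y_test), 3))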
from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier(n_estimators = 100, min_samples_leaf = 1, min_samples_split = 10)
cv_scores = cross_val_score(rf, X, y, cv = 50)
print('Scores:', cv_scores)
print('Mean:', cv_scores.mean())
print('Standard Deviation:', cv_scores.std())
# from sklearn.model_selection import GridSearchCV
# par_grid = {'criterion': ['gini', 'entropy'],
#             'max_depth': list(range(1, 20, 5)),
#             'min_samples_leaf': [1, 5, 10, 15, 25, 50],
#             'min_samples_split': list(range(2, 20, 3)),  # min_samples_split must be >= 2
#             'n_estimators': [100, 150, 200, 500, 1000, 1500]
#            }
# rf = RandomForestClassifier(oob_score = True, random_state = 1, n_jobs = -1)
# grid = GridSearchCV(estimator = rf, param_grid = par_grid, n_jobs = -1).fit(X, y)
# grid.best_params_
- The result of the GridSearchCV function:
{'criterion': 'gini', 'max_depth': 11, 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 200}
rf_final = RandomForestClassifier(n_estimators = 200, oob_score = True, random_state = 42, n_jobs = -1,
min_samples_leaf = 1, min_samples_split = 9).fit(X, y)
yhat = rf_final.predict(x_test)
importances = pd.DataFrame({'Feature': df_train.drop(columns = ['Survived']).columns,
'Importance': rf_final.feature_importances_}).sort_values(by = 'Importance', ascending = False).set_index('Feature')
importances
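The same ranking can be plotted with the matplotlib import from the top of the notebook (sketch):

importances.plot.barh(legend = False)
plt.xlabel('Importance')
plt.gca().invert_yaxis()  # put the most important feature on top
plt.show()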
dict_ = {'PassengerId': df_test_old['PassengerId'], 'Survived': yhat }
final_result = pd.DataFrame(dict_)
final_result.describe()
final_result.to_csv('results.csv',index=False)