Python Programming for Big Data Analysis and Visualisation
Session 5: Data Exploration¶
Welcome to Session 5 of the Python Programming for Big Data Analysis and Visualisation course. In this notebook you will find the material covered during this session.
Exercises¶
There are 5 exercises and 1 mini-project in this notebook.
Recap: visualisation¶
Last time we saw how to make the most common plots using pyplot, pandas and seaborn. In particular we saw:
- Scatterplots to study the relationship between two variables (typically continuous):
  plt.scatter(); pd.DataFrame.plot.scatter(); sns.scatterplot() (and sns.swarmplot() if one variable is categorical).
- Lineplots as the most basic type of plot, using the very powerful plt.plot().
- Barplots to study the relationship between a categorical and a continuous variable:
  pd.DataFrame.plot.bar() and pd.Series.plot.bar().
- Histograms to study the distribution of a variable (typically continuous):
  plt.hist(); pd.DataFrame.hist(); sns.histplot() for continuous variables; sns.countplot() for categorical variables.
- Boxplots to study the relationship between a categorical and a continuous variable:
  pd.DataFrame.boxplot(); sns.boxplot() and sns.violinplot().
All of these libraries use matplotlib in the background, so we can use common pyplot commands to enrich our representations: plt.xlabel(), plt.ylabel(), plt.xlim(), plt.ylim(), plt.grid(), plt.title(), etc...
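For instance, here is a minimal sketch (with made-up toy values) decorating a simple lineplot with a few of these commands:

```python
import matplotlib
matplotlib.use("Agg")  # non-interactive backend, assumed for scripted use
import matplotlib.pyplot as plt

# a basic lineplot, enriched with common pyplot decorations
plt.plot([0, 1, 2, 3], [0, 1, 4, 9])
plt.xlabel("x")
plt.ylabel("x squared")
plt.title("Decorating a pyplot figure")
plt.grid(True)
plt.xlim(0, 3)
```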
We also saw two common ways to manipulate your DataFrames:
- Sorting is an important operation we do on continuous variables: for example
  titanic.sort_values('age') and titanic.sort_index().
- Split-apply-combine to group rows of your data according to the value of a categorical variable, in order to manipulate each group separately: for example
  titanic.groupby('sex').mean() (see also other methods specified above).
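As a refresher, split-apply-combine on a tiny hypothetical DataFrame (made-up values, not the titanic data):

```python
import pandas as pd

df = pd.DataFrame({"sex": ["male", "female", "male", "female"],
                   "age": [20.0, 30.0, 40.0, 50.0]})

# split by "sex", apply mean() to each group, combine into one Series
mean_age = df.groupby("sex")["age"].mean()
```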
More pandas¶
We will see three more fundamental pandas techniques:
- Working with missing data;
- Renaming columns;
- Changing column types.
# first let's import some libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Working with missing data¶
Working with missing data is very common. A typical example of missing data could be a csv-formatted data file like the following:
name,sex,age,fare,survived
Allen,male,,8.05,0
...
(the age of Mr. Allen is missing).
Missing data in pandas is represented by the numpy.nan value, which is of type float.
Checking if data is missing can be done using pd.DataFrame.isna() and pd.Series.isna().
Missing data can be removed from a DataFrame by using pd.DataFrame.dropna(); note that the options axis, how and subset of pd.DataFrame.dropna() (see examples below and the help) can help with only removing the problematic missing data.
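The behaviour of these options can be seen on a tiny hypothetical DataFrame (the values below are made up):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0],
                   "b": [4.0, 5.0, np.nan],
                   "c": [7.0, 8.0, 9.0]})

df.isna()                   # boolean mask of missing values
df.dropna()                 # drop rows containing any NaN: only row 0 survives
df.dropna(how="all")        # drop rows where *all* values are NaN: keeps all 3 rows
df.dropna(subset=["a"])     # only consider column "a": drops row 1
df.dropna(axis="columns")   # drop columns containing any NaN: only "c" survives
```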
Let's load a modified version of the titanic dataset that contains a few rows with missing data:
titanic = pd.read_csv("https://marcopasi.github.io/physenbio_pyDAV/data/titanic_na.csv", index_col=0)
titanic.info() # .info informs us on null (missing) data
<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, Allen to Vestrom
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sex       9 non-null      object 
 1   age       8 non-null      float64
 2   fare      9 non-null      float64
 3   survived  10 non-null     int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 400.0+ bytes
titanic.isna()
| sex | age | fare | survived | |
|---|---|---|---|---|
| name | ||||
| Allen | False | True | False | False |
| Braund | False | False | False | False |
| Cumings | True | False | False | False |
| Futrelle | False | True | True | False |
| Futrelle | False | False | False | False |
| Heikkinen | False | False | False | False |
| Jussila | False | False | False | False |
| Madsen | False | False | False | False |
| Sloper | False | False | False | False |
| Vestrom | False | False | False | False |
titanic.dropna()
| sex | age | fare | survived | |
|---|---|---|---|---|
| name | ||||
| Braund | male | 22.0 | 7.25 | 0 |
| Futrelle | male | 37.0 | 53.10 | 0 |
| Heikkinen | female | 26.0 | 7.92 | 1 |
| Jussila | female | 20.0 | 9.82 | 0 |
| Madsen | male | 24.0 | 7.14 | 1 |
| Sloper | male | 28.0 | 35.50 | 1 |
| Vestrom | female | 14.0 | 7.85 | 0 |
titanic.dropna(axis='columns')
| survived | |
|---|---|
| name | |
| Allen | 0 |
| Braund | 0 |
| Cumings | 1 |
| Futrelle | 1 |
| Futrelle | 0 |
| Heikkinen | 1 |
| Jussila | 0 |
| Madsen | 1 |
| Sloper | 1 |
| Vestrom | 0 |
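Dropping is not the only option: missing values can also be replaced using pd.DataFrame.fillna() (not covered in detail in this session); a minimal sketch on made-up values:

```python
import numpy as np
import pandas as pd

age = pd.Series([20.0, np.nan, 30.0], index=["A", "B", "C"], name="age")

# replace NaN with the mean of the non-missing values (a common simple imputation)
filled = age.fillna(age.mean())
```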
Exercise 1¶
Load the full titanic dataset from "https://marcopasi.github.io/physenbio_pyDAV/data/titanic.csv", using the first column as index. Then:
- Create a new dataframe by removing rows with NA values: how many lines were removed?
- Create a new dataframe by removing columns with NA values: how many columns were removed? How many lines are left?
titanic = pd.read_csv("https://marcopasi.github.io/physenbio_pyDAV/data/titanic.csv").set_index("name")
titanic1 = titanic.dropna()
len(titanic) - len(titanic1)
708
titanic2 = titanic.dropna(axis="columns")
print(len(titanic.columns) - len(titanic2.columns))
len(titanic2)
3
891
titanic.dropna(subset=["age"])
| sex | age | fare | survived | |
|---|---|---|---|---|
| name | ||||
| Braund | male | 22.0 | 7.25 | 0 |
| Cumings | NaN | 38.0 | 71.28 | 1 |
| Futrelle | male | 37.0 | 53.10 | 0 |
| Heikkinen | female | 26.0 | 7.92 | 1 |
| Jussila | female | 20.0 | 9.82 | 0 |
| Madsen | male | 24.0 | 7.14 | 1 |
| Sloper | male | 28.0 | 35.50 | 1 |
| Vestrom | female | 14.0 | 7.85 | 0 |
Renaming columns¶
We've seen modifying the DataFrame index using pd.DataFrame.set_index() and pd.DataFrame.reset_index(); but what about modifying column labels? We can use pd.DataFrame.rename(). The columns argument is a dict, where each key is a column name to rename, and each value is the new name: simple and expressive.
# rename column age to Age
titanic.rename(columns={"age": "Age"})
| sex | Age | fare | survived | |
|---|---|---|---|---|
| name | ||||
| Allen | male | NaN | 8.05 | 0 |
| Braund | male | 22.0 | 7.25 | 0 |
| Cumings | NaN | 38.0 | 71.28 | 1 |
| Futrelle | female | NaN | NaN | 1 |
| Futrelle | male | 37.0 | 53.10 | 0 |
| Heikkinen | female | 26.0 | 7.92 | 1 |
| Jussila | female | 20.0 | 9.82 | 0 |
| Madsen | male | 24.0 | 7.14 | 1 |
| Sloper | male | 28.0 | 35.50 | 1 |
| Vestrom | female | 14.0 | 7.85 | 0 |
Changing column types¶
We've seen that pd.DataFrame.info() informs us on the type of each column of a DataFrame, initially determined automatically by pd.read_csv(). It may sometimes be useful to change the type to something more appropriate: we can use pd.DataFrame.astype(). The argument to astype is a dict, where each key is a column name, and each value is the desired type for that column (similar to the columns argument of pd.DataFrame.rename() !).
titanic.dropna().astype({"age": int, "survived": bool, "sex": "category"})
| sex | age | fare | survived | |
|---|---|---|---|---|
| name | ||||
| Braund | male | 22 | 7.25 | False |
| Futrelle | male | 37 | 53.10 | False |
| Heikkinen | female | 26 | 7.92 | True |
| Jussila | female | 20 | 9.82 | False |
| Madsen | male | 24 | 7.14 | True |
| Sloper | male | 28 | 35.50 | True |
| Vestrom | female | 14 | 7.85 | False |
Remember: method chaining (i.e. pd.DataFrame.first_operation().second_operation().third_operation()) is an expressive and concise way to perform multiple operations on a DataFrame !
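A minimal sketch of such a chain on a hypothetical two-column DataFrame (each method returns a new DataFrame, so the calls compose left to right):

```python
import pandas as pd

df = pd.DataFrame({"x": ["1", "2", "3"], "y": [0.1, 0.2, 0.3]})

result = (df
          .rename(columns={"x": "count"})           # first operation
          .astype({"count": int})                   # second operation
          .sort_values("count", ascending=False))   # third operation
```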
Exercise 2¶
Load the full titanic dataset from "https://marcopasi.github.io/physenbio_pyDAV/data/titanic.csv", using the first column as index.
Perform the following operations in a chain:
- rename the column pclass to "ticket_class";
- remove rows with NA values in the age column;
- set the type of age to int, of survived to bool and of ticket_class to category.
titanic = pd.read_csv("https://marcopasi.github.io/physenbio_pyDAV/data/titanic.csv").set_index("name")
titanic = titanic \
.rename(columns={"pclass": "ticket_class"}) \
.dropna(subset=["age"]) \
.astype({"age": int, "survived": bool, "ticket_class": "category"})
titanic.head()
| passengerId | survived | ticket_class | sex | age | sibsp | parch | ticket | fare | cabin | embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| name | |||||||||||
| Braund, Mr. Owen Harris | 1 | False | 3 | male | 22 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| Cumings, Mrs. John Bradley (Florence Briggs Thayer) | 2 | True | 1 | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| Heikkinen, Miss. Laina | 3 | True | 3 | female | 26 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| Futrelle, Mrs. Jacques Heath (Lily May Peel) | 4 | True | 1 | female | 35 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| Allen, Mr. William Henry | 5 | False | 3 | male | 35 | 0 | 0 | 373450 | 8.0500 | NaN | S |
Data Exploration¶
Exploratory Data Analysis (EDA) is a set of techniques and approaches aimed at analyzing data sets to summarize their main characteristics. It is a fundamental step towards using the data to train a predictive model (e.g. in machine learning) and to make hypotheses about the phenomena described in the data.
EDA has two main goals:
- Determine whether the data makes sense and is ready for analysis, or whether it needs further cleaning/manipulation;
- Find relevant patterns and trends in the data.
We will split EDA in 4 phases:
- Initial exploration
- Monovariate analysis
- Bivariate analysis
- Multivariate analysis
Initial exploration¶
Tries to answer the following questions:
- Has the data been loaded correctly?
  Do we need to rename columns or set an index?
- Is the dataset complete or does it contain missing data?
  Do we need to drop NAs?
- What are the variables and their types?
  Have the variable types been guessed correctly by pd.read_csv?
titanic = pd.read_csv("https://marcopasi.github.io/physenbio_pyDAV/data/titanic.csv").set_index("name")
titanic.head()
titanic.info()
<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, Braund, Mr. Owen Harris to Dooley, Mr. Patrick
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerId  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   sex          891 non-null    object 
 4   age          714 non-null    float64
 5   sibsp        891 non-null    int64  
 6   parch        891 non-null    int64  
 7   ticket       891 non-null    object 
 8   fare         891 non-null    float64
 9   cabin        204 non-null    object 
 10  embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(4)
memory usage: 83.5+ KB
titanic = titanic\
.dropna(subset=["age", "embarked"]) \
.rename(columns={"pclass": "ticket_class"})\
.astype({"survived": bool, "ticket_class": "category",
"age": int, "sex": "category", "embarked": "category"})
Monovariate analysis¶
Obtain descriptive statistics of each numerical variable using pd.DataFrame.describe().
Visual inspection of each variable according to its type:
- continuous: sns.histplot() (note option kde=True)
- categorical: sns.countplot()
titanic.describe()
| passengerId | age | sibsp | parch | fare | |
|---|---|---|---|---|---|
| count | 712.000000 | 712.000000 | 712.000000 | 712.000000 | 712.000000 |
| mean | 448.589888 | 29.622191 | 0.514045 | 0.432584 | 34.567251 |
| std | 258.683191 | 14.502891 | 0.930692 | 0.854181 | 52.938648 |
| min | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 222.750000 | 20.000000 | 0.000000 | 0.000000 | 8.050000 |
| 50% | 445.000000 | 28.000000 | 0.000000 | 0.000000 | 15.645850 |
| 75% | 677.250000 | 38.000000 | 1.000000 | 1.000000 | 33.000000 |
| max | 891.000000 | 80.000000 | 5.000000 | 6.000000 | 512.329200 |
titanic.describe(include="all")
| passengerId | survived | ticket_class | sex | age | sibsp | parch | ticket | fare | cabin | embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 712.000000 | 712 | 712.0 | 712 | 712.000000 | 712.000000 | 712.000000 | 712 | 712.000000 | 183 | 712 |
| unique | NaN | 2 | 3.0 | 2 | NaN | NaN | NaN | 541 | NaN | 133 | 3 |
| top | NaN | False | 3.0 | male | NaN | NaN | NaN | 347082 | NaN | G6 | S |
| freq | NaN | 424 | 355.0 | 453 | NaN | NaN | NaN | 7 | NaN | 4 | 554 |
| mean | 448.589888 | NaN | NaN | NaN | 29.622191 | 0.514045 | 0.432584 | NaN | 34.567251 | NaN | NaN |
| std | 258.683191 | NaN | NaN | NaN | 14.502891 | 0.930692 | 0.854181 | NaN | 52.938648 | NaN | NaN |
| min | 1.000000 | NaN | NaN | NaN | 0.000000 | 0.000000 | 0.000000 | NaN | 0.000000 | NaN | NaN |
| 25% | 222.750000 | NaN | NaN | NaN | 20.000000 | 0.000000 | 0.000000 | NaN | 8.050000 | NaN | NaN |
| 50% | 445.000000 | NaN | NaN | NaN | 28.000000 | 0.000000 | 0.000000 | NaN | 15.645850 | NaN | NaN |
| 75% | 677.250000 | NaN | NaN | NaN | 38.000000 | 1.000000 | 1.000000 | NaN | 33.000000 | NaN | NaN |
| max | 891.000000 | NaN | NaN | NaN | 80.000000 | 5.000000 | 6.000000 | NaN | 512.329200 | NaN | NaN |
# Categorical : countplot
plt.subplot(2,2,1)
sns.countplot(x="survived", data=titanic)
plt.subplot(2,2,2)
sns.countplot(x="ticket_class", data=titanic)
plt.subplot(2,2,3)
sns.countplot(x="sibsp", data=titanic)
plt.subplot(2,2,4)
sns.countplot(x="parch", data=titanic)
plt.tight_layout()
# Continuous : histplot
plt.figure(figsize=(10,4))
plt.subplot(2,2,1)
sns.histplot(x="age", data=titanic)
plt.subplot(2,2,2)
sns.histplot(x="sibsp", data=titanic)
plt.subplot(2,2,3)
sns.histplot(x="parch", data=titanic)
plt.subplot(2,2,4)
sns.histplot(x="fare", data=titanic, kde=True)
plt.tight_layout()
Bivariate analysis¶
Evaluate the statistical relationships between pairs of numerical variables using pd.DataFrame.corr(). A powerful way to visualise the resulting correlation matrix is using sns.heatmap(); note the option annot makes interpretation easier.
Visual inspection of the relationship between pairs of variables according to their type:
- continuous vs categorical: sns.boxplot() and sns.violinplot()
- continuous vs continuous: sns.jointplot() (note options kind="hex" and kind="kde")
- categorical vs categorical: sns.countplot() (using x=... and hue=...)
Remember to use the option hue to add a further categorical variable.
sns.heatmap(titanic.corr(method='spearman', numeric_only=True), annot=True)  # numeric_only restricts to numerical columns (required in recent pandas)
# Continuous vs categorical : boxplot (or violinplot)
# ... 2 examples:
sns.boxplot(x="survived", y="fare", data=titanic, hue="ticket_class")
plt.figure()
sns.violinplot(x="survived", y="fare", data=titanic, hue="sex", split=True, cut=0)
# Continuous vs continuous : jointplot
# ... 2 examples, using the only 2 continuous variables in this dataset
sns.jointplot(x="age", y="fare", data=titanic, hue="sex")
plt.figure()
sns.jointplot(x="age", y="fare", data=titanic, kind="hex")
# The kind=hex option allows us to see where most points are located (2D histogram)
# Categorical vs categorical : countplots using x= and hue=
# ... 2 examples
sns.countplot(x="survived", hue="sex", data=titanic)
plt.figure()
sns.countplot(x="ticket_class", hue="sex", data=titanic)
Multivariate analysis¶
Pseudo-multivariate visual inspection of the relationship between multiple variables according to their type. In addition to the use of hue in several cases above, there are two common techniques to extend a plot to take into account further categorical variables:
- Extend a scatterplot using sns.relplot();
- Extend a boxplot or countplot using sns.catplot() (remember kind="box" and kind="count" respectively).
Relplots¶
# Extend the age vs fare scatterplot to consider also sex, survived and pclass : 5 variables in total !!!
sns.relplot(x="age", y="fare", data=titanic, hue="sex", row='survived', col='ticket_class')
Catplots¶
# Extend the survived vs fare boxplot to consider also sex and pclass
sns.catplot(x="survived", y="fare", data=titanic, hue="sex", kind='box', col='ticket_class')
# Extend the survived countplot to consider also sex and pclass
sns.catplot(x="survived", data=titanic, hue="sex", kind='count', col='ticket_class')
Hypothesis testing¶
Hypothesis testing is one of the workhorses of science: it is how we can draw conclusions or make decisions based on finite samples of data; in this context, we will be using hypothesis testing to determine whether or not our dataset provides sufficient evidence to claim that a given null hypothesis is false.
R is a language dedicated to statistics, while Python is a general-purpose language with statistics modules. R has more statistical analysis features and specialized syntaxes, but Python can still perform basic hypothesis testing. In particular, we will be using the stats module of the scipy library.
from scipy import stats
Continuous vs categorical (binary): comparing two means¶
Many experimental measurements are reported as real-valued numbers, and the simplest comparison we can make is between two groups: the basic test for such situations is the t-test. The t-test comes in multiple flavors, and the specific type of test to use depends on a variety of factors. Below are listed the scipy.stats functions that perform the test, always returning the t statistic and the p-value.
- one-sample test: ttest_1samp(x, popmean)
  the null hypothesis is that the mean of sample x is equal to a given population mean popmean.
- two-sample tests: the null hypothesis is that samples x1 and x2 have the same mean;
  - for independent samples: ttest_ind(x1, x2)
    the test assumes equal variances (homoscedasticity) in the two groups;
    specifying equal_var=False performs Welch's t-test, which relaxes this assumption.
  - for paired samples: ttest_rel(x1, x2)
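On synthetic samples (made-up values, only to illustrate the call signatures), the variants look like this:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x1 = rng.normal(loc=0.0, scale=1.0, size=100)    # group 1
x2 = rng.normal(loc=0.5, scale=1.0, size=100)    # group 2, shifted mean
x1_after = x1 + rng.normal(0.2, 0.1, size=100)   # paired "after" measurement of group 1

t1, p1 = stats.ttest_1samp(x1, 0.0)                  # H0: mean(x1) == 0
t2, p2 = stats.ttest_ind(x1, x2)                     # H0: equal means, equal variances
t2w, p2w = stats.ttest_ind(x1, x2, equal_var=False)  # Welch's t-test
t3, p3 = stats.ttest_rel(x1, x1_after)               # paired samples
```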
t, p_value = stats.ttest_1samp(titanic.loc[:, "age"], 42.4)
t, p_value = stats.ttest_ind(titanic.set_index("survived").loc[False, "age"],
titanic.set_index("survived").loc[True, "age"])
titanic.groupby("survived").var().loc[:, "age"]
survived
False    200.664832
True     221.781601
Name: age, dtype: float64
Exercise 3¶
Determine if there is a statistically significant association between the age of passengers and their sex.
titanic.groupby("sex").var().loc[:, "age"]
sex
female    196.056658
male      215.736213
Name: age, dtype: float64
t, p_value = stats.ttest_ind(titanic.set_index("sex").loc["male", "age"],
titanic.set_index("sex").loc["female", "age"])
Non-parametric tests for comparing two means¶
All t-tests assume that errors are normally distributed. Non-parametric tests have been developed that relax this assumption. The non-parametric versions of the two-sample tests are very similar to the t-tests, and return the test statistic and the p-value.
- for independent samples: mannwhitneyu(x1, x2)
- for paired samples: wilcoxon(x1, x2)
u, p_value = stats.mannwhitneyu(titanic.set_index("survived").loc[False, "fare"],
titanic.set_index("survived").loc[True, "fare"])
Continuous vs categorical: comparing several means¶
Analysis of variance (ANOVA) is a collection of statistical models that generalise t-tests beyond two means. In particular one-way ANOVA tests the null hypothesis that two or more groups have the same population mean. ANOVA can be performed using stats.f_oneway(s1, s2, s3, ...), returning the test statistic as well as the p-value.
ANOVA has important assumptions that must be satisfied in order for the associated p-value to be valid:
- The samples are independent.
- Each sample is from a normally distributed population.
- The population standard deviations of the groups are all equal (homoscedasticity).
The Kruskal-Wallis H-test is a non-parametric version of ANOVA that relaxes assumptions 2 and 3. The null hypothesis is that the population medians of all the groups are equal. The K-W test can be performed using stats.kruskal(s1, s2, s3, ...), returning the test statistic as well as the p-value. For the p-value to be valid, the number of measurements in each group must not be too small; a typical rule is that each sample must have at least 5 measurements.
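A sketch on synthetic groups (made-up values): one group has a clearly shifted mean, so both tests should reject the null hypothesis:

```python
import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
g1 = rng.normal(0.0, 1.0, size=50)
g2 = rng.normal(0.0, 1.0, size=50)
g3 = rng.normal(2.0, 1.0, size=50)  # clearly shifted mean

f, p_anova = stats.f_oneway(g1, g2, g3)  # one-way ANOVA
h, p_kw = stats.kruskal(g1, g2, g3)      # non-parametric alternative
```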
titanic.groupby("ticket_class").var().loc[:, "age"]
ticket_class
1    218.755405
2    196.763880
3    156.028997
Name: age, dtype: float64
f, p_value = stats.f_oneway(titanic.set_index("ticket_class").loc[1, "age"],
titanic.set_index("ticket_class").loc[2, "age"],
titanic.set_index("ticket_class").loc[3, "age"])
Exercise 4¶
Determine if there is a statistically significant association between the fare paid by passengers and their ticket_class.
titanic.groupby("ticket_class").var().loc[:, "fare"]
ticket_class
1    6608.637062
2     173.908290
3     100.865030
Name: fare, dtype: float64
h, p_value = stats.kruskal(titanic.set_index("ticket_class").loc[1, "fare"],
titanic.set_index("ticket_class").loc[2, "fare"],
titanic.set_index("ticket_class").loc[3, "fare"])
Continuous vs continuous: correlation¶
The statistical relationship between two continuous variables can be quantified by means of a correlation coefficient. It is possible to perform an associated test, where the null hypothesis is that such a coefficient is equal to zero. Below are the methods that perform such tests, and return the calculated coefficient as well as the p-value:
- Pearson's correlation coefficient is a measure of linear correlation between two sets of data: stats.pearsonr(x, y)
- Spearman's correlation coefficient is a measure of monotonic correlation between two sets of data: stats.spearmanr(x, y)
r, p_value = stats.spearmanr(titanic.loc[:, "fare"],
titanic.loc[:, "age"])
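As a sanity check, on perfectly linear synthetic data both coefficients are exactly 1:

```python
import numpy as np
from scipy import stats

x = np.arange(10.0)
y = 2.0 * x + 1.0  # exact linear (hence also monotonic) relationship

r, p_r = stats.pearsonr(x, y)     # linear correlation: r == 1.0
rho, p_s = stats.spearmanr(x, y)  # monotonic correlation: rho == 1.0
```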
Categorical vs categorical: comparing frequencies¶
Many measurement devices in biology are based on sampling and counting of molecules. The simplest comparisons we can make involve comparing such counts or frequencies among groups. Count data can often be conveniently stored in contingency tables; the family of $\chi^2$ tests is used in the analysis of contingency tables.
We can calculate contingency tables with pandas, for example using pd.crosstab(series1, series2), or pd.Series.value_counts() for one-dimensional tables.
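On a tiny hypothetical example (made-up values):

```python
import pandas as pd

sex = pd.Series(["male", "female", "male", "male", "female"], name="sex")
survived = pd.Series([True, True, False, True, False], name="survived")

table = pd.crosstab(sex, survived)  # 2x2 contingency table of counts
counts = sex.value_counts()         # one-dimensional table
```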
Below are listed the scipy.stats functions that perform the tests, returning (unless otherwise indicated) the $\chi^2$ statistic and the p-value.
- one-sample test: chisquare(f_obs, f_exp)
  the null hypothesis is that the categorical data in the (one-dimensional) contingency table f_obs has the same frequencies as f_exp; by default, f_exp is taken as a uniform distribution.
- two-sample test: chi2_contingency(observed)
  the null hypothesis is that the observed frequencies in the contingency table observed correspond to independent variables; expected frequencies are computed based on the marginal sums under the assumption of independence. It returns 4 values: the $\chi^2$ statistic, the p-value, the number of degrees of freedom and the expected contingency table.
An often quoted guideline for the validity of the two-sample $\chi^2$ test is that the observed and expected frequencies in each cell should be at least 5. Fisher's exact test (or hypergeometric test) is an alternative, equivalent test that relaxes this assumption; it can be performed on a 2x2 contingency table using fisher_exact(observed), returning the odds-ratio and the p-value.
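A minimal sketch of Fisher's exact test on a made-up 2x2 table:

```python
from scipy import stats

observed = [[8, 2],   # e.g. group A: 8 successes, 2 failures (made-up counts)
            [1, 5]]   #      group B: 1 success,  5 failures

odds_ratio, p_value = stats.fisher_exact(observed)
# odds_ratio is the sample odds ratio: (8*5)/(2*1) == 20
```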
chi2, p_value = stats.chisquare(titanic.loc[:, "sex"].value_counts())
contingency = pd.crosstab(titanic.loc[:, "sex"], titanic.loc[:, "survived"])
contingency
| survived | False | True |
|---|---|---|
| sex | ||
| female | 64 | 195 |
| male | 360 | 93 |
chi2, p_value, dof, expected = stats.chi2_contingency(contingency)
# odds_ratio, p_value = stats.fisher_exact(contingency)
Exercise 5¶
Determine if there is a statistically significant association between the ticket_class of passengers and their survival.
contingency = pd.crosstab(titanic.loc[:, "ticket_class"], titanic.loc[:, "survived"])
chi2, p_value, dof, expected = stats.chi2_contingency(contingency)
mini-Project¶
Tips¶
Take the Tips dataset from the previous notebook and perform a full EDA using all the techniques described above.
On the basis of the EDA, formulate 3 hypotheses on the underlying phenomenon.
The data is available at the following address:
https://marcopasi.github.io/physenbio_pyDAV/data/tips.csv
Here is a description of the variables:
| Variable | Definition | Key |
|---|---|---|
| total_bill | Total bill (cost of the meal), including tax, in US dollars | |
| tip | Tip (gratuity) in US dollars | |
| sex | Sex of person paying for the meal | Female, Male |
| smoker | Smoker in party? | Yes, No |
| day | Day of the week | Thu, Fri, Sat, Sun |
| time | Time of day | Lunch, Dinner |
| size | Size of the party |
tips = pd.read_csv("https://marcopasi.github.io/physenbio_pyDAV/data/tips.csv")
_tips = tips  # keep a reference to the original DataFrame before modifying it
tips.head()
| total_bill | tip | sex | smoker | day | time | size | |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
tips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB
tips = tips.astype({"sex":"category", "smoker":"category", "day":"category", "time":"category"})
tips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.3 KB
tips.describe()
tips.describe(include=['category'])
| sex | smoker | day | time | |
|---|---|---|---|---|
| count | 244 | 244 | 244 | 244 |
| unique | 2 | 2 | 4 | 2 |
| top | Male | No | Sat | Dinner |
| freq | 157 | 151 | 87 | 176 |
# univariate
plt.subplot(2,2,1)
sns.countplot(x="sex", data=tips)
plt.subplot(2,2,2)
sns.countplot(x="smoker", data=tips)
plt.subplot(2,2,3)
sns.countplot(x="day", data=tips)
plt.subplot(2,2,4)
sns.countplot(x="time", data=tips)
plt.tight_layout()
plt.figure(figsize=(10,6))
plt.subplot(2,2,1)
sns.histplot(x="size", data=tips)
plt.subplot(2,2,2)
sns.histplot(x="total_bill", data=tips, kde=True)
plt.subplot(2,1,2)
sns.histplot(x="tip", data=tips, kde=True)
plt.tight_layout()
# ...
License¶

This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.