import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
from scipy import stats
np.random.seed(123)
from pydataset import data
from env import host, user, password
from wrangle_telco import wrangle_telco
from explore import correlation_exploration
$H_0$: There is no difference between smokers' tips and the overall population's tip average.
$H_a$: There is a difference between smokers' tips and the overall population's tip average.
Here is a simple, yet detailed explanation of this process if you need a little more detail.
"There are two possible outcomes; if the result confirms the hypothesis, then you've made a measurement. If the result is contrary to the hypothesis, then you've made a discovery." - Enrico Fermi
For a 95% confidence level, the value of alpha is .05 which means there is a 5% chance that you will make a Type I Error (False Positive) or reject a True Null hypothesis.
So What?
So What?
Now What?
One Sample T-test is when I compare the mean for a subgroup to the population mean.
Are sales for group A higher when we run a promotion?
Two Sample T-test is when we compare the mean of one subgroup to the mean of another subgroup.
# Choose a 95% confidence level for the tests that follow.
confidence_level = 0.95
# Alpha is the complement of the confidence level (probability of a Type I error).
alpha = 1 - confidence_level
A one-sample t-test compares the mean of a subgroup with the population mean.
subpop = array, list, or Series
popmean = a single value (the population mean)
t, p = scipy.stats.ttest_1samp(subpop, popmean)
A two-sample t-test compares the means of two subgroups.
subpop_a = array, list, or Series
subpop_b = array, list, or Series
t, p = scipy.stats.ttest_ind(subpop_a, subpop_b)
A one-tailed test looks for a specific difference: appropriate if I only want to determine if there is a difference between groups in a specific direction, positive only or negative only.
# Check one-tailed test for significance in the positive direction (greater than):
# scipy returns a two-tailed p-value, so halve it, AND require a positive t-statistic.
(p / 2 < alpha) and (t > 0)
# Check one-tailed test for significance in the negative direction (less than):
# halved p again, but now the t-statistic must be negative.
# (Bug fix: the original checked the same `(p/2) < alpha` for both directions,
# with no test of t's sign, so the two checks could not distinguish direction.)
(p / 2 < alpha) and (t < 0)
A two-tailed test looks for any difference: appropriate if I want to test for significance without concern for a positive or negative direction.
# Check a two-tailed test for significance, non-directional
# (scipy already returns a two-tailed p-value, so compare it to alpha directly).
p < alpha
# Create DataFrame from pydataset 'tips' dataset.
df = data('tips')
df.head()
# T-tests assume that the continuous variable is normally distributed; quick check of this.
# NOTE(review): sns.distplot is deprecated in recent seaborn; histplot/displot is the
# modern equivalent — confirm the installed seaborn version before changing it.
sns.distplot(df.tip)
plt.title('Distribution of Tips - Slight Right Skew')
plt.show()
df.tip
# Bug fix: this line originally began with a stray '.', which is a SyntaxError.
print(f'The mean is: {df.tip.mean()}, and the median is {df.tip.median()}')
print('This is close enough to normal to continue.')
$H_0$: There is no difference between smokers' tips and the overall population's tip average.
$H_a$: There is a difference between smokers' tips and the overall population's tip average.
# Set confidence interval, derive alpha, and display it.
confidence_interval = .95
alpha = round(1 - confidence_interval, 2)
print(f'alpha = {alpha}')
# Pull out just the tips left by smokers.
smoker_tips = df[df.smoker == 'Yes'].tip
# The population mean is the mean tip across the entire dataset.
overall_mean_tip = df.tip.mean()
# One-sample t-test: smokers' tips against the overall mean tip.
t, p = stats.ttest_1samp(smoker_tips, overall_mean_tip)
print(f't = {t:.3f}')
print(f'p = {p:.3f}')
print(f'Our p-value is less than our alpha: {p < alpha}')
# T-tests assume that the continuous variable is normally distributed; quick check of this.
sns.distplot(df.tip)
plt.title('Distribution of Tips - Slight Right Skew')
plt.show()
df.tip
# Bug fix: this line originally began with a stray '.', which is a SyntaxError.
print(f'The mean is: {df.tip.mean()}, and the median is {df.tip.median()}')
print('This is close enough to normal to continue.')
$H_0$: There is no difference between women's and men's tips.
$H_a$: There is a difference between women's and men's tips.
# Set confidence interval, derive alpha, and display it.
confidence_interval = .95
alpha = round(1 - confidence_interval, 2)
print(f'alpha = {alpha}')
# Split the tips by sex into two independent samples.
male_tips = df.loc[df.sex == 'Male', 'tip']
female_tips = df.loc[df.sex == 'Female', 'tip']
# Two-sample (independent) t-test comparing the two groups' mean tips.
t, p = stats.ttest_ind(male_tips, female_tips)
print(f'The t-statistic for the two sample t-test comparing male to female tips is {round(t,3)}.')
print(f'Our p-value is {round(p, 3)}.')
print(f'This means there is about a {round(p * 100, 2)}% chance that we observe the data we have.')
print(f'Our p-value is less than our alpha: {p < alpha}')
Positive Correlation: both variables change in the same direction.
Neutral or No Correlation: No relationship in the change of the variables.
Negative Correlation: variables change in opposite directions.
We can use
r, p = stats.pearsonr(x,y)
to find r and p-values.
Keep in mind that...
Here you can see the guts of my function correlation_exploration that I created and imported to make this process faster. I got tired of writing the same code over and over.
def correlation_exploration(df, x_string, y_string):
    '''
    Display the Pearson correlation between two DataFrame columns.

    Takes in a df, a string naming the x variable's column, and a string
    naming the y variable's column. Shows a scatter plot of the two columns
    and prints r and the p-value. Returns None.
    '''
    r, p = stats.pearsonr(df[x_string], df[y_string])
    df.plot.scatter(x_string, y_string)
    plt.title(f"{x_string}'s Relationship with {y_string}")
    # Bug fix: p is a probability, so it must be scaled by 100 before being
    # reported as a percentage (this matches the p * 100 usage elsewhere in
    # this file for the two-sample t-test output).
    print(f'The p-value is: {p}. There is {round(p * 100, 3)}% chance that we see these results by chance.')
    print(f'r = {round(r, 2)}')
    plt.show()
Here you can see the guts of my function to get my Telco data from the Codeup database and clean it up for use in some examples.
def wrangle_telco():
    """
    Queries the telco_churn database

    Returns a clean df with six columns:
        customer_id(object), monthly_charges(float),
        tenure(int), total_charges(float),
        phone_service(object), internet_service_type_id(int)
    """
    # get_data_from_sql() is defined elsewhere in the project (not visible
    # here); presumably it returns the raw telco_churn DataFrame — TODO confirm.
    df = get_data_from_sql()
    # Replace a tenure of 0 (brand-new customers) with 1.
    df.tenure.replace(0, 1, inplace=True)
    # Fill blank-string total_charges from monthly_charges.
    # NOTE(review): passing a Series as the replacement value relies on pandas
    # aligning it on the index — verify this fills each row with its own
    # monthly_charges value as intended.
    df.total_charges.replace(' ', df.monthly_charges, inplace=True)
    # total_charges arrives as object/str; cast to float so math works on it.
    df.total_charges = df.total_charges.astype(float)
    return df
# Pull the cleaned Telco data and take a quick look at its structure and first rows.
telco = wrangle_telco()
telco.info()
telco.head()
$H_0$: There is no linear correlation between tenure and monthly charges.
$H_a$: There is a linear correlation between tenure and monthly charges.
# Set confidence interval, derive alpha, and display it.
confidence_interval = 0.95
alpha = round(1 - confidence_interval, 2)
print(f'alpha = {alpha}')
# Plot and report the correlation between tenure and monthly charges.
correlation_exploration(telco, 'tenure', 'monthly_charges')
$H_0$: There is no linear correlation between tenure and total charges.
$H_a$: There is a linear correlation between tenure and total charges.
correlation_exploration(telco, 'tenure', 'total_charges')
$H_0$: There is no linear correlation between tenure and monthly charges for customers who don't have phone but do have DSL services.
$H_a$: There is a linear correlation between tenure and monthly charges for customers who don't have phone but do have DSL services.
# Create a subset of customers without phone service but with internet.
# This ONLY includes DSL: internet_service_type_id == 1 is DSL (id 2 is Fiber,
# and every Fiber customer has phone service, so the phone_service filter
# already excludes them). The original mask OR'd in id == 2, which contradicted
# the "only DSL" comment; filtering on id == 1 states the intent directly.
no_phone_yes_dsl = telco[(telco.phone_service == 'No') & (telco.internet_service_type_id == 1)]
no_phone_yes_dsl.head()
# The relationship between tenure and monthly charges for customers that don't have phone service
# but DO have DSL
correlation_exploration(no_phone_yes_dsl, 'tenure', 'monthly_charges')
$H_0$: There is no linear correlation between tenure and monthly charges for customers with phone and Fiber services.
$H_a$: There is a linear correlation between tenure and monthly charges for customers with phone and Fiber services.
# Subset of customers who have Fiber; all Fiber customers also have phone service.
yes_phone_yes_fiber = telco.loc[telco.internet_service_type_id == 2]
yes_phone_yes_fiber.head()
# Correlation of tenure with monthly charges for customers with phone and Fiber service.
correlation_exploration(yes_phone_yes_fiber, 'tenure', 'monthly_charges')
Were people with a higher ticket class on the Titanic more likely to survive?
$H_0$ = Survival rate is independent of ticket class.
$H_a$ = Survival rate is not independent of ticket class.
Does the type of service package a customer has affect the likelihood that she will churn?
$H_0$ = Churn is independent of type of service package.
$H_a$ = Churn is not independent of type of service package.
# ctab == observed values: cross-tabulate two different categorical Series from the df.
ctab = pd.crosstab(df.categorical_col_a, df.categorical_col_b)
# expected == values we would expect to see if the variables are independent of each other.
chi2, p, degf, expected = stats.chi2_contingency(ctab)
Steps
# Build the 'tips' DataFrame from pydataset and preview the first rows.
tips = data('tips')
tips.head()
$H_0$ = Whether a person is a smoker is independent of his/her sex.
$H_a$ = Whether a person is a smoker is not independent of his/her sex.
# 95% confidence level -> alpha of 0.05.
confidence_level = .95
alpha = round(1 - confidence_level, 2)
alpha
# Observed counts: cross-tabulation of smoker status by sex.
ctab = pd.crosstab(tips.smoker, tips.sex)
ctab
# Chi-squared test of independence on the observed counts.
chi2, p, degf, expected = stats.chi2_contingency(ctab)
print(f'Our p-value is {p}.')
print(f'Our p-value is less than our alpha: {p < alpha}')
$H_0$ = Whether a person churns is independent of the package type they purchase.
$H_a$ = Whether a person churns is not independent of the package type they purchase.
# Hand-build an observed-counts table: churn status by product purchased.
index = ['Churn', 'No Churn']
columns = ['Product A', 'Product B']
observed = pd.DataFrame([[100, 50], [120, 28]], index=index, columns=columns)
observed
# Chi-squared test of independence on the hand-built table.
chi2, p, degf, expected = stats.chi2_contingency(observed)
print(f'Our p-value is {p}.')
print(f'Our p-value is less than our alpha: {p < alpha}')