Project Business Statistics: E-news Express¶

Marks: 60

Preparation Information¶

  • Developed and Analyzed by: Jerry Gonzalez
  • Cohort: November 2023 - Group D

Problem Statement and Objectives¶

Business Context:¶

The advent of e-news, or electronic news, portals has offered us a great opportunity to quickly get updates on the day-to-day events occurring globally. The information on these portals is retrieved electronically from online databases, processed using a variety of software, and then transmitted to the users. Transmitting news electronically has multiple advantages, such as faster access to content and the ability to utilize different technologies such as audio, graphics, video, and other interactive elements that are either not used or not yet common in traditional newspapers.

E-news Express, an online news portal, aims to expand its business by acquiring new subscribers. With every visitor to the website taking certain actions based on their interest, the company plans to analyze these actions to understand user interests and determine how to drive better engagement. The executives at E-news Express believe that new monthly subscribers have declined compared to the past year because the current webpage is not well designed, in terms of its outline and recommended content, to keep customers engaged long enough to decide to subscribe.

[Companies often analyze user responses to two variants of a product to decide which of the two variants is more effective. This experimental technique, known as A/B testing, is used to determine whether a new feature attracts users based on a chosen metric.]

Objective¶

The design team of the company has researched and created a new landing page that has a new outline and more relevant content compared to the old page. To test the effectiveness of the new landing page in gathering new subscribers, the Data Science team conducted an experiment by randomly selecting 100 users and dividing them equally into two groups. The existing landing page was served to the first group (the control group) and the new landing page to the second group (the treatment group). Data regarding the interaction of users in both groups with the two versions of the landing page was collected. As a data scientist at E-news Express, you have been asked to explore the data and perform a statistical analysis (at a significance level of 5%) to determine the effectiveness of the new landing page in gathering new subscribers for the news portal by answering the following questions:

  • Do the users spend more time on the new landing page than on the existing landing page?
  • Is the conversion rate (the proportion of users who visit the landing page and get converted) for the new page greater than the conversion rate for the old page?
  • Does the converted status depend on the preferred language?
  • Is the time spent on the new page the same for the different language users?

Data Dictionary¶

The data contains information regarding the interaction of users in both groups with the two versions of the landing page.

  • user_id - Unique user ID of the person visiting the website
  • group - Whether the user belongs to the first group (control) or the second group (treatment)
  • landing_page - Whether the landing page is new or old
  • time_spent_on_the_page - Time (in minutes) spent by the user on the landing page
  • converted - Whether the user gets converted to a subscriber of the news portal or not
  • language_preferred - Language chosen by the user to view the landing page

Problem Definition¶

E-News Express (an electronic news portal) executives are concerned about the decline in monthly subscribers to their portal. Their conjecture is that the decline is due to the current landing page not sufficiently captivating users to convert to a paid subscription. To remedy this issue, the Design Team has developed a new landing page with an updated outline and content. The objective is to evaluate the effectiveness of the new landing page, compared to the old landing page, in acquiring new subscribers.

The Data Science Team conducted an A/B test by randomly assigning 100 users evenly into two groups: a control group that was presented with the old landing page and a treatment group presented with the new landing page. Data was then collected on their user interactions along with their preferred language.

The task is to perform statistical analysis to determine the new landing page's effectiveness in attracting new subscribers. The following are key questions to address:

  1. Do the users spend more time on the new landing page than on the existing landing page?
  2. Is the conversion rate (the proportion of users who visit the landing page and get converted) for the new page greater than the conversion rate for the old page?
  3. Does the converted status depend on the preferred language?
  4. Is the time spent on the new page the same for the different language users?

Import all the necessary libraries¶

In [54]:
# Libraries to help with reading and manipulating data
import numpy as np
import pandas as pd

# Libraries to help with data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

# Library to help with statistical analysis
import scipy.stats as stats 

Initialize some basic pandas configurations¶

In [55]:
# removing the limit for the number of displayed columns
pd.set_option("display.max_columns", None) # To set column limits replace None with a number
# setting the limit for the number of displayed rows
pd.set_option("display.max_rows", 25) # limit the display to 25 rows
# setting the precision of floating numbers to 2 decimal points
pd.set_option("display.float_format", lambda x: "%.2f" % x)

Load some useful functions¶

In [56]:
# Purpose: Calculate various descriptive statistical values for a specific column in a DataFrame
#
# Prerequisites:
#    Requires the caller to only send data that descriptive statistics can safely be calculated for.
#    A production version would require more extensive data validation checks and more robust exception handling.
#
# Inputs
#    data   : DataFrame object containing rows and columns of data
#    feature: str representing the column name to run statistics on
#
def calculate_statistics(data, feature):

    # Only calculate and print statistics if the feature is a single column string name and data is a DataFrame
    if isinstance(data, pd.DataFrame) and isinstance(feature, str):

        # For the future, would like to use describe() to pull data types for each column,
        # then only perform the calculations and prints if of type int64 or float64

        # Calculate and print various descriptive statistical values
        print(f"Descriptive Statistics for {feature}\n")
        print(f"Mean              : {data[feature].mean():.6f}")
        print(f"Mode              : {data[feature].mode()[0]}")
        print(f"Median            : {data[feature].median()}")
        print(f"Min               : {data[feature].min()}")
        print(f"Max               : {data[feature].max()}")
        print(f"Standard Deviation: {data[feature].std():.6f}")
        print(f"Percentiles       : \n{data[feature].quantile([.25, .50, .75])}")
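The comments above anticipate a dtype guard; below is a minimal sketch of that check, where calculate_statistics_safe is a hypothetical wrapper name (not part of the project spec):

# Hedged sketch: only run the numeric summaries when the column is numeric.
# calculate_statistics_safe is a hypothetical wrapper, not part of the assignment.
from pandas.api.types import is_numeric_dtype

def calculate_statistics_safe(data, feature):
    if isinstance(data, pd.DataFrame) and isinstance(feature, str) and is_numeric_dtype(data[feature]):
        calculate_statistics(data, feature)
    else:
        print(f"Skipping {feature}: not a numeric column of a DataFrame")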
In [59]:
# Purpose: Create histogram plot to visualize the distribution of continuous numerical data
#      by dividing the data into bins and displaying the frequency of observations within each bin.
#      Histogram is useful for understanding the underlying distribution, shape, central tendency, 
#      and spread of the data.
#
# Inputs:
#
#      input_data: DataFrame object containing rows and columns of data
#      feature: str representing the column name to plot a histogram for
#      in_kde: boolean value; True: plot the kde density line; False: do not plot the kde line
#

def histogram(input_data, feature, in_kde=True):
    
    # Only proceed if the feature is a single column string name and data is a DataFrame
    if isinstance(input_data, pd.DataFrame) and isinstance(feature, str):
        
        #stores the x axis left and right buffer size
        buffer=5
            
        #set the x limits based on the minimum and maximum values for x-axis feature
        xmin_value = int(input_data[feature].min())
        xmax_value = int(input_data[feature].max())
        plt.xlim(xmin_value-buffer,xmax_value+buffer)
        
        #plot the histogram (the x limits above add a small buffer on each side)
        sns.histplot(data=input_data, x=feature, kde=in_kde);
        
        #set the title, x and y labels
        plt.title('Histogram of '+ feature)
        plt.xlabel(feature)
        plt.ylabel('Frequency')
        
        plt.show()
In [60]:
# Purpose: Create a Boxplot, for a particular column/feature, to visually summarize the distribution
#      of a continuous numerical variable and to identify potential outliers within the data.
#
# Inputs:
#
#      input_data: DataFrame object containing rows and columns of data
#      feature: str representing the column name to plot a count plot graph for
#      in_vert: False (display horizontal); True (display vertically)
#      in_showfliers: True (show the outliers); False (do not show outliers)
#      in_showlabels: True (show boxplot labels for Max,Q3,Median,Q1, and Min); False (do not show labels)
#

def boxplot(input_data,feature,in_vert=False,in_showfliers=False, in_showlabels=True):
    
    #label translator key
    label_translator = {'caps 0': 'Min', 'caps 1': 'Max', 'whiskers 0': 'Q1', 'whiskers 1': 'Q3','medians 0':'Median', 'boxes 0':'Box'}
    
    # Only proceed if data is a DataFrame
    if isinstance(input_data,pd.DataFrame):
 
        ax = plt.boxplot(input_data[feature], vert=in_vert, showfliers=in_showfliers)
         
        plt.title('Boxplot of '+ feature)
        plt.xlabel(feature)
        
        # Revisit this later to figure out a more robust way to get the min x value for labels.
        # Right now, the median has the smallest x value when in vert mode
        min_x = ax['medians'][0].get_xydata()[0][0] - .05
         
        #show labels for the main boxplot lines (e.g, Max Line, Q1, Q3, Min Line)
        for i in ax.keys():
            for index,line in enumerate(ax[i]):
                
                # for some reason boxes is redundant for Q1. Filter this value out. We're also not interested in 
                # labels for outliers
                if (i != 'boxes' and i != 'fliers'):
                    line_x,line_y=line.get_xydata()[0]
                    
                    #if needed and in vertical mode, show the key boxplot labels
                    #revisit later to get labels working in horizontal mode
                    if (in_showlabels == True and in_vert == True):
                        label_key = f"{i} {index}"
                        label = f"{label_translator[label_key]}:{line_y:.2f}"
                        plt.text(min_x,line_y,label,ha='right',va='center',color='blue',fontsize=9)
        
        plt.show()
In [61]:
# Purpose: Create a countplot; used to visualize counts for categorical data
#
# Inputs:
#
#     input_data: DataFrame object containing rows and columns of data
#     feature: str representing the column name to plot a count plot graph for (category column)
#     show_perc: value from [0,1] indicating the top % values to show in the countplot
#     label_count: True (show count labels); False (show percentage labels)
#
def countplot (input_data, feature, show_perc=1.0, label_count=True):
    if isinstance(input_data, pd.DataFrame) and isinstance(feature, str):
                
        #Set the figure size. However, revisit this later to see if there is a robust way to increase the
        #figure size based on the number of x-axis labels.
        plt.figure(figsize=(10, 6))   
        
        #Perform a total counts, which sorts the list in descending order. Then grab the list of columns
        order_cols = input_data[feature].value_counts().index
        
        #Use the percentage value to determine how many of the top columns to show
        num_to_show = int(len(order_cols)*show_perc)
        
        #Grab the top columns to show in the count plot
        cols_to_show = input_data[feature].value_counts().nlargest(num_to_show).index
                
        #plot the top columns
        cp=sns.countplot(data=input_data,x=feature,order=cols_to_show)
        
        #rotate x labels for better readability
        plt.xticks(rotation=90)
        
        #set the title
        plt_title = f"Countplot for the top {show_perc*100:.0f}% {feature} values"
        plt.title(plt_title)
        
        # Apply some simple label formatting; remove the underscores and replace with a blank space
        cp.set_xlabel(feature.replace('_', ' ').title(), fontsize=15)
        
        #show values for each bar/patch. The labels will either be numerical values or percentages.
        for p in cp.patches:
            
            total_values = len(input_data[feature])
            
            #show count labels
            if label_count == True:
                label = p.get_height() 
            else:
                # show percentage label
                label = "{:.1f}%".format(100 * p.get_height() / total_values)
            
            cp.annotate(
                label, 
                (p.get_x()+p.get_width()/2.,p.get_height()),
                ha='center',
                va='center',
                xytext=(0,5), # set the label offset above the bar
                textcoords='offset points'
            )

        plt.show()
In [62]:
# Purpose: Create Boxplot for multiple variables (x being a categorical value)
#
# Inputs:
#
#     in_data: DataFrame object containing rows and columns of data
#     x_feature: str representing the column name for the x-axis (categorical data)
#     y_feature: str representing the column name for the y-axis
#
def multi_boxplot (in_data, x_feature, y_feature):

    # Only proceed if the features are single column string names and data is a DataFrame
    if isinstance(in_data, pd.DataFrame) and isinstance(x_feature, str) and isinstance(y_feature, str):

        # visualizing the relationship between the two features
        plt.figure(figsize=(12, 5))
        sns.boxplot(data=in_data, x=x_feature, y=y_feature, showmeans=True)
        plt.xticks(fontsize=15)
        plt.yticks(fontsize=15)
        plt.xticks(rotation='vertical')
        plt.xlabel(x_feature, fontsize=15)
        plt.ylabel(y_feature, fontsize=15);
        
        plt.show() 
In [63]:
# Purpose: Create a heatmap which supports multivariate analysis of 2+ numerical features. Heatmaps are
#     useful for identifying correlations between 2 or more variables.
#
# Inputs:
#
#     input_data: DataFrame object containing rows and columns of data
#
def heatmap (input_data):
    if isinstance(input_data, pd.DataFrame):
        plt.figure(figsize=(10,8))
        
        # Select only the numerical columns
        numeric_columns = input_data.select_dtypes(include=np.number)

        sns.heatmap(numeric_columns.corr(),annot=True,cmap='Spectral',vmin=-1,vmax=1)
        plt.show()    
    

Reading the Data into a DataFrame¶

In [64]:
# Read the A/B test data set into a pandas DataFrame object
df = pd.read_csv("./abtest.csv")

Explore the dataset and extract insights using Exploratory Data Analysis¶

  • Data Overview
    • Viewing the first and last few rows of the dataset
    • Checking the shape of the dataset
    • Getting the statistical summary for the variables
  • Check for missing values
  • Check for duplicates
In [65]:
# Verify the data file was read correctly by displaying the first five rows.
df.head()
Out[65]:
user_id group landing_page time_spent_on_the_page converted language_preferred
0 546592 control old 3.48 no Spanish
1 546468 treatment new 7.13 yes English
2 546462 treatment new 4.40 no Spanish
3 546567 control old 3.02 no French
4 546459 treatment new 4.75 yes Spanish
In [66]:
# Verify the entire data file was read correctly by displaying the last five rows.
df.tail()
Out[66]:
user_id group landing_page time_spent_on_the_page converted language_preferred
95 546446 treatment new 5.15 no Spanish
96 546544 control old 6.52 yes English
97 546472 treatment new 7.07 yes Spanish
98 546481 treatment new 6.20 yes Spanish
99 546483 treatment new 5.86 yes English

Observation(s):¶

  • The head and tail output confirms the file was read in correctly.
In [12]:
# Print out the number of rows and columns in the data file.
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.")
There are 100 rows and 6 columns.
In [13]:
# Print out basic information on the data file.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_id                 100 non-null    int64  
 1   group                   100 non-null    object 
 2   landing_page            100 non-null    object 
 3   time_spent_on_the_page  100 non-null    float64
 4   converted               100 non-null    object 
 5   language_preferred      100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB

Observation(s)¶

  • The user_id is an integer
  • The time_spent_on_the_page is a float
  • The remaining four columns are all of type object.
  • Confirms 100 rows were read in and there are no null values.
In [14]:
# Print out some basic descriptive statistics on the A/B test data
df.describe(include='all').T
Out[14]:
count unique top freq mean std min 25% 50% 75% max
user_id 100.00 NaN NaN NaN 546517.00 52.30 546443.00 546467.75 546492.50 546567.25 546592.00
group 100 2 control 50 NaN NaN NaN NaN NaN NaN NaN
landing_page 100 2 old 50 NaN NaN NaN NaN NaN NaN NaN
time_spent_on_the_page 100.00 NaN NaN NaN 5.38 2.38 0.19 3.88 5.42 7.02 10.71
converted 100 2 yes 54 NaN NaN NaN NaN NaN NaN NaN
language_preferred 100 3 Spanish 34 NaN NaN NaN NaN NaN NaN NaN

Observation¶

  • The average time_spent_on_the_page is 5.38 minutes with a standard deviation of 2.38 minutes.
  • The min and max time_spent_on_the_page are 0.19 min and 10.71 min, respectively.
  • For time_spent_on_the_page, the mean and median are almost equal, and the distance from the min to the 25th percentile is about the same as the distance from the 75th percentile to the max. Therefore, this column likely has a roughly normal (symmetric) distribution; a quick skewness check is sketched below.
  • There are two unique group values with control occurring 50 times out of 100.
  • There are two unique landing_page values with old occurring 50 times out of 100.
  • There are two unique converted values with yes occurring 54 times out of 100.
  • There are three unique language_preferred values with Spanish occurring 34 times out of 100.
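As a quick sanity check of the symmetry observation above, a minimal sketch (skewness near 0 is consistent with an approximately symmetric distribution):

# Hedged sketch: skewness near 0 supports the near-symmetric reading above
print(f"Skewness of time_spent_on_the_page: {df['time_spent_on_the_page'].skew():.3f}")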

Check for Missing Values¶

In [15]:
# Check for missing values.
df.isnull().sum()
Out[15]:
user_id                   0
group                     0
landing_page              0
time_spent_on_the_page    0
converted                 0
language_preferred        0
dtype: int64

Observation¶

  • There are no missing values.

Check for Duplicate Values¶

In [16]:
# Check for duplicate values by counting unique values per column
df.nunique()
Out[16]:
user_id                   100
group                       2
landing_page                2
time_spent_on_the_page     94
converted                   2
language_preferred          3
dtype: int64

Observation(s):¶

  • There are 100 rows (in this sample data set) and the unique key (user_id) has 100 unique values.
  • There are no duplicate values for user_id.
  • There are two distinct group values.
  • There are two distinct landing_page values.
  • There are two distinct converted values.
  • There are three distinct language_preferred values.

Ensure the Control values match the Old landing page and the Treatment values match the New landing page.¶

In [67]:
#The control group should always be given the old landing page. Verify the data reflects this.
df_control = df[(df['group'] == 'control') & (df['landing_page'] != 'old')]
print(f"There are {len(df_control)} control and old value data inconsistencies.")

#The treatment group should always be given the new landing page. Verify the data reflects this.
df_treatment = df[(df['group'] == 'treatment') & (df['landing_page'] != 'new')]
print(f"There are {len(df_treatment)} treatment and new value inconsistencies.")
There are 0 control and old value data inconsistencies.
There are 0 treatment and new value inconsistencies.

Univariate Analysis¶

In [71]:
# group field univariate analysis

#Create a countplot for the group values
countplot(df,'group',show_perc=1.0,label_count=True)

#Print the total number of unique group values
print(f"Number of groups: {df['group'].nunique()}")
Number of groups: 2

Observation(s):¶

  • There are two group values (control and treatment), each with 50 counts.
  • This verifies the sample was split evenly between the control group (old landing page) and the treatment group (new landing page).
In [70]:
# landing_page field univariate analysis

#Create a countplot for the landing_page values
countplot(df,'landing_page',show_perc=1.0,label_count=True)

#Print the total number of unique landing page values
print(f"Number of landing pages: {df['landing_page'].nunique()}")
Number of landing pages: 2

Observation(s):¶

  • There are two landing_page values (old and new), each with 50 counts.
  • This verifies the sampling was split evenly between the old landing page and the new landing page.
In [20]:
# time_spent_on_the_page field univariate analysis
selected_column = 'time_spent_on_the_page'

#calculate univariate statistics
calculate_statistics(df,selected_column) 

#show histogram for time_spent_on_the_page
histogram(df,selected_column)

#show boxplot for time_spent_on_the_page
boxplot(df,selected_column,in_vert=True, in_showfliers=False,in_showlabels=True)
Descriptive Statistics for time_spent_on_the_page

Mean              : 5.377800
Mode              : 0.4
Median            : 5.415
Min               : 0.19
Max               : 10.71
Standard Deviation: 2.378166
Percentiles       : 
0.25   3.88
0.50   5.42
0.75   7.02
Name: time_spent_on_the_page, dtype: float64

Observation(s)¶

  • The time_spent_on_the_page visually follows an approximately normal distribution.
  • The average time_spent_on_the_page is 5.3778 min and the median is 5.42 min.
  • The min and max times spent on the page were 0.19 min and 10.71 min, respectively.
In [73]:
# converted field univariate analysis

#Create a countplot for the converted values
countplot(df,'converted',show_perc=1.0,label_count=True)

#Print the total number of unique converted values
print(f"Number of converted values: {df['converted'].nunique()}")
Number of converted values: 2

Observation(s):¶

  • There are 54 users out of 100 that were converted into a subscription.
  • There are 46 users out of 100 that were not converted into a subscription.
In [74]:
# language_preferred field univariate analysis

#Create a countplot for the language_preferred values
countplot(df,'language_preferred',show_perc=1.0,label_count=True)

#Print the total number of unique language_preferred values
print(f"Number of language_preferred values: {df['language_preferred'].nunique()}")
Number of language_preferred values: 3

Observation(s):¶

  • There are three preferred languages (Spanish, French, and English).
  • Spanish and French each have 34 counts.
  • English has 32 counts.
  • The sample is split fairly evenly across Spanish, French, and English users.

Bivariate Analysis¶

In [75]:
# visualizing the relationship between landing_page and time_spent_on_the_page
multi_boxplot(in_data=df,x_feature="landing_page", y_feature="time_spent_on_the_page")   

Observations:¶

  • The new landing page has a higher median and mean (green triangle) than the old landing page.
  • The new landing page does have some outliers on both ends.
  • The middle 50% of users on the new landing page spend approximately 5 to 7 minutes on the page.
  • The middle 50% of users on the old landing page spend approximately 3 to 6.5 minutes on the page.
  • Based on descriptive statistics, users on average spend more time on the new landing page.
In [76]:
# visualizing the relationship between language_preferred and time_spent_on_the_page
multi_boxplot(in_data=df,x_feature="language_preferred", y_feature="time_spent_on_the_page")   

Observations:¶

  • English and French users have similar distributions, with English users having a slightly higher median and mean.
  • The middle 50% of Spanish users stay engaged for approximately 4.5 to 6.5 minutes.
  • Spanish users (with one outlier as an exception) tend to stay longer on the landing page, but also do not stay on for more than about 9 minutes.
In [77]:
# visualizing the relationship between converted and time_spent_on_the_page
multi_boxplot(in_data=df,x_feature="converted", y_feature="time_spent_on_the_page")   

Observation(s):¶

  • The middle 50% of converted users stayed on the landing page for approximately 5.5 to 7.5 minutes, which is higher than for those who did not convert.
  • Based on descriptive statistics, this suggests that the longer a user spends on the landing page, the more likely they are to convert to a paid subscription.
In [78]:
# Plot the number of converted values (yes or no) for each landing page (old or new)
sns.countplot(data=df, x='landing_page', hue='converted');

Observation(s)¶

  • Based on descriptive statistics, there appears to be a higher number of conversions from the new landing page.
  • Inferential statistics are used below to determine, at a given significance level, whether this is true.

1. Do the users spend more time on the new landing page than the existing landing page?¶

Perform Visual Analysis¶

In [79]:
# visualizing the relationship between the old and new landing page and the total time spent on the page.
multi_boxplot(in_data=df,x_feature="landing_page", y_feature="time_spent_on_the_page")   

Observation(s):¶

  • On average, users spend more time on the new landing page than on the old one.
  • Nearly all users on the new landing page stay on the page for approximately 3.5 minutes or more (with one low outlier).
  • Descriptive statistics suggest the new landing page keeps users on the page longer.
    • Inferential statistics are needed to determine, at the chosen significance level, whether this is true.

Step 1: Define the null and alternate hypotheses¶

Let's frame the null and alternative hypotheses based on the above claim, where $\mu_n$ and $\mu_o$ are the mean times spent on the new and old landing pages, respectively:

$H_0$: $\mu_n = \mu_o$; on average, users spend the same amount of time on the new landing page as on the old landing page.

$H_a$: $\mu_n \gt \mu_o$; on average, users spend more time on the new landing page than on the old landing page.

Step 2: Select Appropriate test¶

Information:¶

  • Need to compare two sample means (old landing page and new landing page)
  • Population standard deviations are unknown
  • By nature of the sampling method, there are two independent populations.
  • Based on the alternative hypothesis, this will be a one-tailed test (using alternative='greater')

Based on the above information, utilize the two-sample independent t-test.
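For reference, with equal_var=False (the choice made in Step 4 below), scipy's ttest_ind computes Welch's statistic, which does not pool the two sample variances:

$t = \frac{\bar{x}_n - \bar{x}_o}{\sqrt{\frac{s_n^2}{n_n} + \frac{s_o^2}{n_o}}}$

where $\bar{x}$, $s^2$, and $n$ denote the sample mean, sample variance, and sample size for each landing page.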

Step 3: Decide the significance level¶

Significance level ($\alpha$) = 0.05 (5%)

Step 4: Collect and prepare data¶

In [81]:
# Collect the time spent values for the new landing page
time_spent_new_landing = df[df['landing_page']=='new']['time_spent_on_the_page']

# Collect the time spent values for the old landing page
time_spent_old_landing= df[df['landing_page']=='old']['time_spent_on_the_page']

# Calculate the standard deviations to determine how to set the equal_var parameter below.
print (f"The standard deviation for time spent on the new landing page is {round(time_spent_new_landing.std(),4)}")
print (f"The standard deviation for time spent on the old landing page is {round(time_spent_old_landing.std(),4)}")
The standard deviation for time spent on the new landing page is 1.817
The standard deviation for time spent on the old landing page is 2.582
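Beyond comparing the standard deviations by eye, here is a minimal sketch of a more formal check using Levene's test (the same scipy function applied in Question 4 below); a small p-value here would further support setting equal_var=False:

# Hedged sketch: formal equality-of-variances check for the two pages
from scipy.stats import levene

levene_stat, levene_p = levene(time_spent_new_landing, time_spent_old_landing)
print(f"Levene test p-value: {levene_p:.4f}")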

Step 5: Calculate the p-value¶

In [82]:
#import the required t-test function
from scipy.stats import ttest_ind

# find the p-value
test_stat, p_value = ttest_ind(time_spent_new_landing, time_spent_old_landing, equal_var = False, alternative = 'greater')
print(f"The p-value is {round(p_value,8)}")
The p-value is 0.00013924

Step 6: Compare the p-value with $\alpha$¶

In [83]:
if (p_value < 0.05):
    print(f'As the p-value {round(p_value,8)} is less than the level of significance; we reject the null hypothesis.')
else:
    print(f'As the p-value {round(p_value,8)} is greater than the level of significance; we fail to reject the null hypothesis.')
As the p-value 0.00013924 is less than the level of significance; we reject the null hypothesis.

Step 7: Draw inference¶

Insight¶

Since the null hypothesis is rejected (i.e., the alternative hypothesis is supported), there is sufficient statistical evidence that users on average spend more time on the new landing page than on the old landing page.

A similar approach can be followed to answer the other questions.

2. Is the conversion rate (the proportion of users who visit the landing page and get converted) for the new page greater than the conversion rate for the old page?¶

Perform Visual Analysis¶

In [84]:
# Create a crosstab table for landing page vs converted
df_crosstab = pd.crosstab(df['landing_page'],df['converted'])
df_crosstab
Out[84]:
converted no yes
landing_page
new 17 33
old 29 21
In [85]:
# create a stacked bar plot to compare the distributions of both the categorical features
df_crosstab.plot(kind='bar',stacked =True)
plt.legend()
plt.show()

Observations¶

  • Based on descriptive statistics and visualizations, it does appear that the conversion rate from the new landing page is greater than the conversion rate from the old landing page (a quick rate computation is sketched below).
  • Inferential statistics are used below to confirm the above claim.
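A minimal sketch computing the sample conversion rates directly from the crosstab above:

# Hedged sketch: sample conversion rate per landing page from df_crosstab
conversion_rates = df_crosstab['yes'] / df_crosstab.sum(axis=1)
print(conversion_rates)  # new: 33/50 = 0.66, old: 21/50 = 0.42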

Step 1: Define the null and alternate hypotheses¶

Let's frame the null and alternative hypotheses based on the above claim:

$H_0$: $p_n$ = $p_o$; the proportion of users converted from the new landing page is equal to the proportion of users converted from the old landing page.

$H_a$: $p_n$ > $p_o$; the proportion of users converted from the new landing page is greater than the proportion of users converted from the old landing page.

Assumptions¶

  • Binomial Distribution (converted: yes or no)
  • Based on the above alternative hypothesis, this will be a one-sided test (alternative='larger')
  • Comparing two sample proportions

Determine whether the sample sizes are sufficient:¶

  • Verify that both $np$ and $n(1-p)$ are $\geq 10$:

  • Verify the sample counts for the new landing page:
    • $n_n = 50$
    • $p_n = \frac{33}{50}$
  • $n_n \times p_n = 50 \times \frac{33}{50} = 33 \geq 10$
  • $n_n(1-p_n) = 50 \times (1-\frac{33}{50}) = 50 \times \frac{17}{50} = 17 \geq 10$

  • Verify the sample counts for the old landing page:
    • $n_o = 50$
    • $p_o = \frac{21}{50}$
  • $n_o \times p_o = 50 \times \frac{21}{50} = 21 \geq 10$
  • $n_o(1-p_o) = 50 \times (1-\frac{21}{50}) = 50 \times \frac{29}{50} = 29 \geq 10$
  • Yes, the sample sizes are sufficient (a programmatic check is sketched below).
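A minimal sketch of the same check computed programmatically, using the conversion counts found in Step 4 below (33 of 50 on the new page, 21 of 50 on the old page):

# Hedged sketch: verify n*p and n*(1-p) >= 10 for both landing pages
for page, converted_count, n in [("new", 33, 50), ("old", 21, 50)]:
    p_hat = converted_count / n
    print(f"{page}: n*p = {n * p_hat:.0f}, n*(1-p) = {n * (1 - p_hat):.0f}")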

Step 2: Select Appropriate test¶

  • Based on the above assumptions, utilize the two-sample proportion z-test
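For reference, proportions_ztest (used in Step 5 below) computes the pooled-proportion statistic:

$Z = \frac{\hat{p}_n - \hat{p}_o}{\sqrt{\hat{p}(1-\hat{p})\left(\frac{1}{n_n} + \frac{1}{n_o}\right)}}$, where $\hat{p} = \frac{x_n + x_o}{n_n + n_o}$ is the pooled conversion proportion.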

Step 3: Decide the significance level¶

Significance level ($\alpha$) = 0.05 (5%)

Step 4: Collect and prepare data¶

In [89]:
# Get the number of people who converted from the new landing page
new_converted = df[df['landing_page']=='new']['converted'].value_counts()['yes']

# Get the number of people who converted from the old landing page
old_converted = df[df['landing_page']=='old']['converted'].value_counts()['yes']

# Create the list of conversion values from the new and old landing pages
conversions = [new_converted, old_converted]

# Get the total number of new observations
new_obs = df[df['landing_page']=='new']['landing_page'].value_counts()['new']

# Get the total number of old observations
old_obs = df[df['landing_page']=='old']['landing_page'].value_counts()['old']

nobs = [new_obs, old_obs]

print(f"Number of new landing page conversions: {new_converted}")
print(f"Number of old landing page conversions: {old_converted}")
print(f"Number of new landing page observations: {new_obs}")
print(f"Number of old landing page observations: {old_obs}")
Number of new landing page conversions: 33
Number of old landing page conversions: 21
Number of new landing page observations: 50
Number of old landing page observations: 50

Step 5: Calculate the p-value¶

In [90]:
#import the required functions
from statsmodels.stats.proportion import proportions_ztest

test_stat, p_value = proportions_ztest(conversions,nobs, alternative='larger')
print(f"p_value is {round(p_value,8)}")
p_value is 0.00802631

Step 6: Compare the p-value with $\alpha$¶

In [91]:
if (p_value < .05):
    print(f"The p_value of {round(p_value,8)} is less than .05. Therefore, reject the null hypothesis.")
else:
    print(f"The p_value of {round(p_value,8)} is greater than .05. Therefore, there is not enough statistical evidence to reject the null hypothesis.")
The p_value of 0.00802631 is less than .05. Therefore, reject the null hypothesis.

Step 7: Draw inference¶

  • Since the null hypothesis was rejected, there is enough statistical evidence to state that the proportion of new landing page conversions is greater than the proportion of old landing page conversions.

3. Does the converted status depend on the preferred language?¶

Perform Visual Analysis¶

In [92]:
# Create a crosstab table for preferred language vs converted
df_crosstab = pd.crosstab(df['language_preferred'],df['converted'])
df_crosstab
Out[92]:
converted no yes
language_preferred
English 11 21
French 19 15
Spanish 16 18
In [93]:
# create a stacked bar plot to compare the distributions of both the categorical features
df_crosstab.plot(kind='bar',stacked =True)
plt.legend()
plt.show()

Observations:¶

  • Based on descriptive statistics and visualizations, English users appear to convert at a higher rate than users of other languages.
  • Inferential statistics are used below to test this claim.

Step 1: Define the null and alternate hypotheses¶

Let's frame the null and alternative hypotheses based on the above claim:

$H_0$: Conversions and preferred language are independent

$H_a$: Conversions and preferred language are not independent

Assumptions:¶

  • Categorical data (converted and preferred language)
  • Random sampling from the population
  • The expected count in each cell of the contingency table is at least 5 (checked in the sketch below)
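A minimal sketch of the expected-count check, reusing the df_crosstab built above (chi2_contingency returns the expected frequencies under independence):

# Hedged sketch: verify all expected cell counts are >= 5
from scipy.stats import chi2_contingency

_, _, _, expected_counts = chi2_contingency(df_crosstab)
print(expected_counts)
print("All expected counts >= 5:", (expected_counts >= 5).all())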

Step 2: Select Appropriate test¶

Based on the above assumptions, select the Chi-Square Test of Independence
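For reference, the test statistic compares the observed cell counts $O_{ij}$ with the expected counts $E_{ij}$ under independence:

$\chi^2 = \sum_{i,j} \frac{(O_{ij} - E_{ij})^2}{E_{ij}}$, where $E_{ij} = \frac{(\text{row}_i \text{ total}) \times (\text{column}_j \text{ total})}{N}$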

Step 3: Decide the significance level¶

Significance level ($\alpha$) = 0.05 (5%)

Step 4: Collect and prepare data¶

  • Use the crosstab variable (df_crosstab) that was already prepared above.

Step 5: Calculate the p-value¶

In [99]:
#import the required functions
from scipy.stats import chi2_contingency

chi, p_value, dof, expected = chi2_contingency(df_crosstab)
print (f"The p_value is {round(p_value,8)}.")
The p_value is 0.21298887.

Step 6: Compare the p-value with $\alpha$¶

In [100]:
if (p_value < .05):
    print(f"The p_value of {round(p_value,8)} is less than .05. Therefore, reject the null hypothesis.")
else:
    print(f"The p_value of {round(p_value,8)} is greater than .05. Therefore, there is not enough statistical evidence to reject the null hypothesis.")
The p_value of 0.21298887 is greater than .05. Therefore, there is not enough statistical evidence to reject the null hypothesis.

Step 7: Draw inference¶

  • Since the null hypothesis was not rejected, we maintain the status quo: there is not enough statistical evidence to conclude that converted status depends on the preferred language.

4. Is the time spent on the new page the same for the different language users?¶

Perform Visual Analysis¶

In [102]:
#create a new dataframe with just the new landing_page data; this will include data from all preferred languages
df_new = df[df['landing_page']=='new']
df_new.head()
Out[102]:
user_id group landing_page time_spent_on_the_page converted language_preferred
1 546468 treatment new 7.13 yes English
2 546462 treatment new 4.40 no Spanish
4 546459 treatment new 4.75 yes Spanish
6 546448 treatment new 5.25 yes French
8 546461 treatment new 10.71 yes French
In [103]:
# visualizing the relationship between preferred languages (from the new landing page) vs time_spent_on_the_page
multi_boxplot(in_data=df_new,x_feature="language_preferred", y_feature="time_spent_on_the_page")     
In [98]:
# Calculate the average time spent on the new landing page for each preferred language
mu = df_new.groupby(['language_preferred'])['time_spent_on_the_page'].mean()
mu
Out[98]:
language_preferred
English   6.66
French    6.20
Spanish   5.84
Name: time_spent_on_the_page, dtype: float64

Observations:¶

  • Based on the above descriptive statistics, it does appear that the time spent on the new landing page differs for at least one language (the Spanish mean is lower than the English and French means).
  • Inferential statistics are used below to determine whether there is sufficient statistical evidence to support this assertion.

Step 1: Define the null and alternate hypotheses¶

Let $\mu_s$, $\mu_e$, $\mu_f$ be the average time spent on the new page for each of the preferred languages (Spanish, English, and French).

$H_0$: $\mu_s$ $=$ $\mu_e$ $=$ $\mu_f$

$H_a$: At least one of the means is not the same.

Assumptions - Part 1 of 2¶

  • Need to determine whether the time spent on the new page is normally distributed.
  • To verify this assumption, the Shapiro-Wilk test (scipy's shapiro function) will be used.

$H_0$: The time spent on the page follows a normal distribution

$H_a$: The time spent on the page does not follow a normal distribution

In [104]:
#import the required functions
from scipy.stats import shapiro

# Run the Shapiro-Wilk test to check the normality assumption

statistic_val, p_value = shapiro(df_new['time_spent_on_the_page'])
print (f"The p_value is {round(p_value,8)}.")
The p_value is 0.80400163.
In [44]:
if (p_value < .05):
    print(f"The p_value of {round(p_value,8)} is less than .05. Therefore, reject the null hypothesis.")
else:
    print(f"The p_value of {round(p_value,8)} is greater than .05. Therefore, do not reject the null hypothesis.")
The p_value of 0.80400163 is greater than .05. Therefore, do not reject the null hypothesis.

Insight¶

  • Since the null hypothesis was not rejected, we keep the null hypothesis as the status quo: the time spent on the page is consistent with a normal distribution.

Need to determine if the group populations have a common variance.

  • To verify this claim, the levene function will be used.

$H_0$: All the population variances are the same.

$H_a$: At least one variance is different from the rest.

In [105]:
# Run Levene's test to check the common-variance assumption
#import the required functions
from scipy.stats import levene

# For each language user, get the time spent on each page
df_new_spanish = df_new[df_new['language_preferred']=='Spanish']['time_spent_on_the_page']
df_new_english = df_new[df_new['language_preferred']=='English']['time_spent_on_the_page']
df_new_french = df_new[df_new['language_preferred']=='French']['time_spent_on_the_page']

# calculate the statistic and p value
statistic_val, p_value = levene (df_new_spanish, df_new_english, df_new_french)
print (f"The p_value is {round (p_value,8)}.")
The p_value is 0.46711358.
In [106]:
if (p_value < .05):
    print(f"The p_value of {round(p_value,8)} is less than .05. Therefore, reject the null hypothesis.")
else:
    print(f"The p_value of {round(p_value,8)} is greater than .05. Therefore, do not reject the null hypothesis.")
The p_value of 0.46711358 is greater than .05. Therefore, do not reject the null hypothesis.

Insight¶

  • Since the null hypothesis was not rejected, we keep the null hypothesis as the status quo: the population variances can be treated as equal.

Assumptions - Part 2 of 2¶

  • Samples are independent simple random samples
  • The group populations are normally distributed
  • The group populations have a common variance

Step 2: Select Appropriate test¶

Based on the above assumptions, utilize a One-Way ANOVA Test
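For reference, the one-way ANOVA statistic compares the variability between the language groups to the variability within them:

$F = \frac{MS_{between}}{MS_{within}} = \frac{\sum_{k} n_k(\bar{x}_k - \bar{x})^2 / (k-1)}{\sum_{k}\sum_{i} (x_{ki} - \bar{x}_k)^2 / (N-k)}$

where $k$ is the number of language groups, $\bar{x}_k$ are the group means, $\bar{x}$ is the overall mean, and $N$ is the total number of observations.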

Step 3: Decide the significance level¶

Significance level ($\alpha$) = 0.05 (5%)

Step 4: Collect and prepare data¶

  • Utilize df_new_spanish, df_new_english, and df_new_french variables created for the levene test.

Step 5: Calculate the p-value¶

In [48]:
from scipy.stats import f_oneway

test_stat, p_value = f_oneway(df_new_spanish, df_new_english, df_new_french)

print (f"The p_value is {round(p_value,8)}.")
The p_value is 0.43204139.

Step 6: Compare the p-value with $\alpha$¶

In [49]:
if (p_value < .05):
    print(f"The p_value of {round(p_value,6)} is less than .05. Therefore, reject the null hypothesis.")
else:
    print(f"The p_value of {round(p_value,6)} is greater than .05. Therefore, do not reject the null hypothesis.")
The p_value of 0.432041 is greater than .05. Therefore, do not reject the null hypothesis.

Step 7: Draw inference¶

Since the null hypothesis was not rejected, we keep the status quo: there is not enough evidence that the average time spent on the new landing page differs across the preferred languages (Spanish, English, and French).

Conclusion and Business Recommendations¶

Initial questions and insights are as follows¶

  • Question 1: Do the users spend more time on the new landing page than on the existing landing page?

    There is sufficient statistical evidence that users on average are spending more time on the new landing page than on the old landing page.

  • Question 2: Is the conversion rate (the proportion of users who visit the landing page and get converted) for the new page greater than the conversion rate for the old page?

    There is enough statistical evidence to state that the proportion of new landing page conversions is greater than the proportion of old landing page conversions.

  • Question 3: Does the converted status depend on the preferred language?

    There is not enough statistical evidence to conclude that the converted status depends on the preferred language; conversion and preferred language appear to be independent.

  • Question 4: Is the time spent on the new page the same for the different language users?

    There was no statistically significant difference in the average time spent on the new landing page across the preferred languages (Spanish, English, and French).

Conclusion¶

  • Based on statistical evidence, the new landing page is more effective in converting users to paid E-news subscriptions.
  • Based on statistical evidence, conversion status is independent of language preferences.

Recommendations¶

  • Immediate: Roll out the new landing page for all users as there is statistical evidence confirming the new landing page converts more users.
  • Localized Content: Although the conversion status was independent of language preferences, recommend investing in tailoring content or messaging to specific language preferences.
  • A/B Testing Iterations: Continue using A/B Testing on other web content refinements and ideas to continue to roll out effective changes.
  • Segmentation Analysis: Invest in acquiring additional user demographic information (gender, age, location, etc.) to tailor content pushes to specific customer segments.
  • User Retention: Invest in user retention data and analytics to understand and increase retention rates. It's more cost effective to retain a subscriber than to convert one.