Creating a class of objects from a group of functions

I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.

I have defined a series of functions, all which act on a pandas DataFrame, and I wish to create a class from this series of functions. I thought it would be a good idea to create a class because I will be using this code on DataFrames that are pretty standard going forward.

My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!

Thanks in advance everyone!

Below are the functions I have defined:

def __describe(df, col):

    df_desc = df[col].describe()

    return df_desc



def __sort_by_character_study_day(df):        

    new_index_values = df.index.levels[0].str.split().str[-1].astype(int)

    df.index = df.index.set_levels(new_index_values, level='study_day')

    df = df.sort_index()

    return df



def change_from_baseline(df, col):

    df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])

    groupedf = grouper(df=df_sorted, by='Animal_id')

    df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')

    df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100

    return df_sorted



def grouper(df, by):

    df = df.groupby(by)

    return df



def sns_box_plot(df, col, y_lab):   

    plt.subplots(figsize=(12,8))

    sns.set_style("whitegrid")

    g = sns.boxplot(x="study_day", y = col, hue="group_c", 

                    data = df, palette='rainbow', 

                    hue_order=["Group 1", "Group 2", "Group 3"])

    plt.ylabel(y_lab, fontsize=20)

    plt.xlabel("Study Day", fontsize=20)

    plt.tick_params('both', labelsize='14')

    plt.show()



def sns_bar_plot(df, col, y_lab):       

    plt.subplots(figsize=(12,8))

    sns.set_style("whitegrid")

    g = sns.barplot(x="study_day", y = col, hue="group_c", 

                    data = df, palette='rainbow')

    plt.ylabel(y_lab, fontsize=20)

    plt.xlabel("Study Day", fontsize=20)

    plt.tick_params('both', labelsize='14')

    plt.show()



def sort_df(df, by):

    df = df.sort_values(by)

    return df   



def summary_stats(df, by, col):

    dfgrouped = grouper(df, by)

    df_desc = __describe(dfgrouped, col)

    df_summ = pd.DataFrame()

    df_summ["count"] = df_desc['count']

    df_summ["mean"] = df_desc['mean'].round(2).astype(str)

    df_summ["std"] = df_desc['std'].round(2).astype(str)

    df_summ["25%"] = df_desc['25%'].round(2).astype(str)

    df_summ["50%"] = df_desc['50%'].round(2).astype(str)

    df_summ["75%"] = df_desc['75%'].round(2).astype(str)

    df_summ["min"] = df_desc['min'].round(2).astype(str)

    df_summ["max"] = df_desc['max'].round(2).astype(str)

    return __sort_by_character_study_day(df_summ)



def summary_stat_formatted(df, by, col):

    data_summ = summary_stats(df, by, col)

    formatted_summ = pd.DataFrame()

    formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

    formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

    formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

    formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

    return formatted_summ



def write_to_excel(df, outfile, sheetname):

    writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

    df.to_excel(writer, sheet_name=sheetname)

When I call one of the functions, say:

summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')

Everything works as expected, I get the following output:
enter image description here

Now, since I will be using these same functions on nearly identical DataFrames (only thing different is the name of the dataframe and the analysis column), I thought it might be a good idea to create a class (please let me know if this is the right approach). My class looks like this:

class PreClinicalData(object):



    def __init__(self, df):

        self.df = df



    def __describe(self, col):

        df_desc = self.df[col].describe()

        return df_desc



    def __sort_by_character_study_day(self):

        new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)

        self.df.index = self.df.index.set_levels(new_index_values, level='study_day')

        self.df = self.df.sort_index()

        return self.df



    def change_from_baseline(self,  col):

        df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])

        groupedf = self.grouper(df=df_sorted, by='Animal_id')

        df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')

        df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100

        return df_sorted



    def summary_stats(self, by, col):

        dfgrouped = self.grouper(df=self.df, by=by)

        df_desc = self.__describe(dfgrouped, col=col)

        df_summ = pd.DataFrame()

        df_summ["count"] = df_desc['count']

        df_summ["mean"] = df_desc['mean'].round(2).astype(str)

        df_summ["std"] = df_desc['std'].round(2).astype(str)

        df_summ["25%"] = df_desc['25%'].round(2).astype(str)

        df_summ["50%"] = df_desc['50%'].round(2).astype(str)

        df_summ["75%"] = df_desc['75%'].round(2).astype(str)

        df_summ["min"] = df_desc['min'].round(2).astype(str)

        df_summ["max"] = df_desc['max'].round(2).astype(str)

        return self.__sort_by_character_study_day(df_summ)



    def summary_stat_formatted(self, by, col):

        data_summ = self.summary_stats(self, by, col)

        formatted_summ = pd.DataFrame()

        formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

        formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

        formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

        formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

        return formatted_summ



    def write_to_excel(self, outfile, sheetname):

        writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

        self.df.to_excel(writer, sheet_name=sheetname)    



    def grouper(self, by):

        return self.df.groupby(by)



    def sns_box_plot(self, col, y_lab):   

        plt.subplots(figsize=(12,8))

        sns.set_style("whitegrid")

        g = sns.boxplot(x="study_day", y = col, hue="group_c", 

                        data = self.df, palette='rainbow', 

                        hue_order=["Group 1", "Group 2", "Group 3"])

        plt.ylabel(y_lab, fontsize=20)

        plt.xlabel("Study Day", fontsize=20)

        plt.tick_params('both', labelsize='14')

        plt.show()



    def sns_bar_plot(self, col, y_lab):       

        plt.subplots(figsize=(12,8))

        sns.set_style("whitegrid")

        g = sns.barplot(x="study_day", y = col, hue="group_c", 

                        data = self.df, palette='rainbow')

        plt.ylabel(y_lab, fontsize=20)

        plt.xlabel("Study Day", fontsize=20)

        plt.tick_params('both', labelsize='14')

        plt.show()



    def sort_df(self, by):

        return self.df.sort_values(by)   



    def summary_stats(self, by, col):

        dfgrouped = self.grouper(self, by)

        df_desc = self.__describe(dfgrouped, col)

        df_summ = pd.DataFrame()

        df_summ["count"] = df_desc['count']

        df_summ["mean"] = df_desc['mean'].round(2).astype(str)

        df_summ["std"] = df_desc['std'].round(2).astype(str)

        df_summ["25%"] = df_desc['25%'].round(2).astype(str)

        df_summ["50%"] = df_desc['50%'].round(2).astype(str)

        df_summ["75%"] = df_desc['75%'].round(2).astype(str)

        df_summ["min"] = df_desc['min'].round(2).astype(str)

        df_summ["max"] = df_desc['max'].round(2).astype(str)

        return self.__sort_by_character_study_day(df_summ)



    def summary_stat_formatted(self, by, col):

        data_summ = self.summary_stats(self, by, col)

        formatted_summ = pd.DataFrame()

        formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

        formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

        formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

        formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

        return formatted_summ



    def write_to_excel(self, outfile, sheetname):

        writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

        self.df.to_excel(writer, sheet_name=sheetname)

I can instantiate an instance of the class as follows:

bodyweight = PreClinicalData(chickv_data['bodyweight'])

And I am able to call the DataFrame and use the head() method just fine.:

bodyweight.df.head()

Now, when I call the same method as shown above...I get the following:

bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")

which yields the following error:

enter image description here

I have no clue where these alleged three arguments are coming from?!

asked Nov 27 '18 at 13:19

TheCuriouslyCodingFoxah

647

1

Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…

– user3471881
Nov 27 '18 at 14:27

2

Don't pass self to grouper: it's just self.grouper(by). if you pass self it will get self(automatically) then self, then by, hence the 3 args.

– progmatico
Nov 27 '18 at 15:48

1

Why are you using double-underscore name-mangling, e.g. __describe? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe method? Or, did you read somewhere that __ is "private" in Python?

– juanpa.arrivillaga
Nov 27 '18 at 21:30

2

So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.

– juanpa.arrivillaga
Nov 27 '18 at 22:19

2

Think about PreclinicalData(df).sns_box_plot('y', 'y-label'). Why is this better than sns_box_plot(df, 'y', 'y-label')?

– user3471881
Nov 28 '18 at 8:43

|
show 8 more comments

I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.

My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!

Thanks in advance everyone!

Below are the functions I have defined:

def __describe(df, col):

    df_desc = df[col].describe()

    return df_desc



def __sort_by_character_study_day(df):        

    new_index_values = df.index.levels[0].str.split().str[-1].astype(int)

    df.index = df.index.set_levels(new_index_values, level='study_day')

    df = df.sort_index()

    return df



def change_from_baseline(df, col):

    df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])

    groupedf = grouper(df=df_sorted, by='Animal_id')

    df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')

    df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100

    return df_sorted



def grouper(df, by):

    df = df.groupby(by)

    return df



def sns_box_plot(df, col, y_lab):   

    plt.subplots(figsize=(12,8))

    sns.set_style("whitegrid")

    g = sns.boxplot(x="study_day", y = col, hue="group_c", 

                    data = df, palette='rainbow', 

                    hue_order=["Group 1", "Group 2", "Group 3"])

    plt.ylabel(y_lab, fontsize=20)

    plt.xlabel("Study Day", fontsize=20)

    plt.tick_params('both', labelsize='14')

    plt.show()



def sns_bar_plot(df, col, y_lab):       

    plt.subplots(figsize=(12,8))

    sns.set_style("whitegrid")

    g = sns.barplot(x="study_day", y = col, hue="group_c", 

                    data = df, palette='rainbow')

    plt.ylabel(y_lab, fontsize=20)

    plt.xlabel("Study Day", fontsize=20)

    plt.tick_params('both', labelsize='14')

    plt.show()



def sort_df(df, by):

    df = df.sort_values(by)

    return df   



def summary_stats(df, by, col):

    dfgrouped = grouper(df, by)

    df_desc = __describe(dfgrouped, col)

    df_summ = pd.DataFrame()

    df_summ["count"] = df_desc['count']

    df_summ["mean"] = df_desc['mean'].round(2).astype(str)

    df_summ["std"] = df_desc['std'].round(2).astype(str)

    df_summ["25%"] = df_desc['25%'].round(2).astype(str)

    df_summ["50%"] = df_desc['50%'].round(2).astype(str)

    df_summ["75%"] = df_desc['75%'].round(2).astype(str)

    df_summ["min"] = df_desc['min'].round(2).astype(str)

    df_summ["max"] = df_desc['max'].round(2).astype(str)

    return __sort_by_character_study_day(df_summ)



def summary_stat_formatted(df, by, col):

    data_summ = summary_stats(df, by, col)

    formatted_summ = pd.DataFrame()

    formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

    formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

    formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

    formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

    return formatted_summ



def write_to_excel(df, outfile, sheetname):

    writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

    df.to_excel(writer, sheet_name=sheetname)

When I call one of the functions, say:

summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')

Everything works as expected, I get the following output:
enter image description here

class PreClinicalData(object):



    def __init__(self, df):

        self.df = df



    def __describe(self, col):

        df_desc = self.df[col].describe()

        return df_desc



    def __sort_by_character_study_day(self):

        new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)

        self.df.index = self.df.index.set_levels(new_index_values, level='study_day')

        self.df = self.df.sort_index()

        return self.df



    def change_from_baseline(self,  col):

        df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])

        groupedf = self.grouper(df=df_sorted, by='Animal_id')

        df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')

        df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100

        return df_sorted



    def summary_stats(self, by, col):

        dfgrouped = self.grouper(df=self.df, by=by)

        df_desc = self.__describe(dfgrouped, col=col)

        df_summ = pd.DataFrame()

        df_summ["count"] = df_desc['count']

        df_summ["mean"] = df_desc['mean'].round(2).astype(str)

        df_summ["std"] = df_desc['std'].round(2).astype(str)

        df_summ["25%"] = df_desc['25%'].round(2).astype(str)

        df_summ["50%"] = df_desc['50%'].round(2).astype(str)

        df_summ["75%"] = df_desc['75%'].round(2).astype(str)

        df_summ["min"] = df_desc['min'].round(2).astype(str)

        df_summ["max"] = df_desc['max'].round(2).astype(str)

        return self.__sort_by_character_study_day(df_summ)



    def summary_stat_formatted(self, by, col):

        data_summ = self.summary_stats(self, by, col)

        formatted_summ = pd.DataFrame()

        formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

        formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

        formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

        formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

        return formatted_summ



    def write_to_excel(self, outfile, sheetname):

        writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

        self.df.to_excel(writer, sheet_name=sheetname)    



    def grouper(self, by):

        return self.df.groupby(by)



    def sns_box_plot(self, col, y_lab):   

        plt.subplots(figsize=(12,8))

        sns.set_style("whitegrid")

        g = sns.boxplot(x="study_day", y = col, hue="group_c", 

                        data = self.df, palette='rainbow', 

                        hue_order=["Group 1", "Group 2", "Group 3"])

        plt.ylabel(y_lab, fontsize=20)

        plt.xlabel("Study Day", fontsize=20)

        plt.tick_params('both', labelsize='14')

        plt.show()



    def sns_bar_plot(self, col, y_lab):       

        plt.subplots(figsize=(12,8))

        sns.set_style("whitegrid")

        g = sns.barplot(x="study_day", y = col, hue="group_c", 

                        data = self.df, palette='rainbow')

        plt.ylabel(y_lab, fontsize=20)

        plt.xlabel("Study Day", fontsize=20)

        plt.tick_params('both', labelsize='14')

        plt.show()



    def sort_df(self, by):

        return self.df.sort_values(by)   



    def summary_stats(self, by, col):

        dfgrouped = self.grouper(self, by)

        df_desc = self.__describe(dfgrouped, col)

        df_summ = pd.DataFrame()

        df_summ["count"] = df_desc['count']

        df_summ["mean"] = df_desc['mean'].round(2).astype(str)

        df_summ["std"] = df_desc['std'].round(2).astype(str)

        df_summ["25%"] = df_desc['25%'].round(2).astype(str)

        df_summ["50%"] = df_desc['50%'].round(2).astype(str)

        df_summ["75%"] = df_desc['75%'].round(2).astype(str)

        df_summ["min"] = df_desc['min'].round(2).astype(str)

        df_summ["max"] = df_desc['max'].round(2).astype(str)

        return self.__sort_by_character_study_day(df_summ)



    def summary_stat_formatted(self, by, col):

        data_summ = self.summary_stats(self, by, col)

        formatted_summ = pd.DataFrame()

        formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

        formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

        formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

        formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

        return formatted_summ



    def write_to_excel(self, outfile, sheetname):

        writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

        self.df.to_excel(writer, sheet_name=sheetname)

I can instantiate an instance of the class as follows:

bodyweight = PreClinicalData(chickv_data['bodyweight'])

And I am able to call the DataFrame and use the head() method just fine.:

bodyweight.df.head()

Now, when I call the same method as shown above...I get the following:

bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")

which yields the following error:

enter image description here

I have no clue where these alleged three arguments are coming from?!

asked Nov 27 '18 at 13:19

TheCuriouslyCodingFoxah

647

1

Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…

– user3471881
Nov 27 '18 at 14:27

2

Don't pass self to grouper: it's just self.grouper(by). if you pass self it will get self(automatically) then self, then by, hence the 3 args.

– progmatico
Nov 27 '18 at 15:48

1

Why are you using double-underscore name-mangling, e.g. __describe? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe method? Or, did you read somewhere that __ is "private" in Python?

– juanpa.arrivillaga
Nov 27 '18 at 21:30

2

So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.

– juanpa.arrivillaga
Nov 27 '18 at 22:19

2

Think about PreclinicalData(df).sns_box_plot('y', 'y-label'). Why is this better than sns_box_plot(df, 'y', 'y-label')?

– user3471881
Nov 28 '18 at 8:43

|
show 8 more comments

I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.

My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!

Thanks in advance everyone!

Below are the functions I have defined:

def __describe(df, col):

    df_desc = df[col].describe()

    return df_desc



def __sort_by_character_study_day(df):        

    new_index_values = df.index.levels[0].str.split().str[-1].astype(int)

    df.index = df.index.set_levels(new_index_values, level='study_day')

    df = df.sort_index()

    return df



def change_from_baseline(df, col):

    df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])

    groupedf = grouper(df=df_sorted, by='Animal_id')

    df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')

    df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100

    return df_sorted



def grouper(df, by):

    df = df.groupby(by)

    return df



def sns_box_plot(df, col, y_lab):   

    plt.subplots(figsize=(12,8))

    sns.set_style("whitegrid")

    g = sns.boxplot(x="study_day", y = col, hue="group_c", 

                    data = df, palette='rainbow', 

                    hue_order=["Group 1", "Group 2", "Group 3"])

    plt.ylabel(y_lab, fontsize=20)

    plt.xlabel("Study Day", fontsize=20)

    plt.tick_params('both', labelsize='14')

    plt.show()



def sns_bar_plot(df, col, y_lab):       

    plt.subplots(figsize=(12,8))

    sns.set_style("whitegrid")

    g = sns.barplot(x="study_day", y = col, hue="group_c", 

                    data = df, palette='rainbow')

    plt.ylabel(y_lab, fontsize=20)

    plt.xlabel("Study Day", fontsize=20)

    plt.tick_params('both', labelsize='14')

    plt.show()



def sort_df(df, by):

    df = df.sort_values(by)

    return df   



def summary_stats(df, by, col):

    dfgrouped = grouper(df, by)

    df_desc = __describe(dfgrouped, col)

    df_summ = pd.DataFrame()

    df_summ["count"] = df_desc['count']

    df_summ["mean"] = df_desc['mean'].round(2).astype(str)

    df_summ["std"] = df_desc['std'].round(2).astype(str)

    df_summ["25%"] = df_desc['25%'].round(2).astype(str)

    df_summ["50%"] = df_desc['50%'].round(2).astype(str)

    df_summ["75%"] = df_desc['75%'].round(2).astype(str)

    df_summ["min"] = df_desc['min'].round(2).astype(str)

    df_summ["max"] = df_desc['max'].round(2).astype(str)

    return __sort_by_character_study_day(df_summ)



def summary_stat_formatted(df, by, col):

    data_summ = summary_stats(df, by, col)

    formatted_summ = pd.DataFrame()

    formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

    formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

    formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

    formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

    return formatted_summ



def write_to_excel(df, outfile, sheetname):

    writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

    df.to_excel(writer, sheet_name=sheetname)

When I call one of the functions, say:

summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')

Everything works as expected, I get the following output:
enter image description here

class PreClinicalData(object):



    def __init__(self, df):

        self.df = df



    def __describe(self, col):

        df_desc = self.df[col].describe()

        return df_desc



    def __sort_by_character_study_day(self):

        new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)

        self.df.index = self.df.index.set_levels(new_index_values, level='study_day')

        self.df = self.df.sort_index()

        return self.df



    def change_from_baseline(self,  col):

        df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])

        groupedf = self.grouper(df=df_sorted, by='Animal_id')

        df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')

        df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100

        return df_sorted



    def summary_stats(self, by, col):

        dfgrouped = self.grouper(df=self.df, by=by)

        df_desc = self.__describe(dfgrouped, col=col)

        df_summ = pd.DataFrame()

        df_summ["count"] = df_desc['count']

        df_summ["mean"] = df_desc['mean'].round(2).astype(str)

        df_summ["std"] = df_desc['std'].round(2).astype(str)

        df_summ["25%"] = df_desc['25%'].round(2).astype(str)

        df_summ["50%"] = df_desc['50%'].round(2).astype(str)

        df_summ["75%"] = df_desc['75%'].round(2).astype(str)

        df_summ["min"] = df_desc['min'].round(2).astype(str)

        df_summ["max"] = df_desc['max'].round(2).astype(str)

        return self.__sort_by_character_study_day(df_summ)



    def summary_stat_formatted(self, by, col):

        data_summ = self.summary_stats(self, by, col)

        formatted_summ = pd.DataFrame()

        formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

        formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

        formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

        formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

        return formatted_summ



    def write_to_excel(self, outfile, sheetname):

        writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

        self.df.to_excel(writer, sheet_name=sheetname)    



    def grouper(self, by):

        return self.df.groupby(by)



    def sns_box_plot(self, col, y_lab):   

        plt.subplots(figsize=(12,8))

        sns.set_style("whitegrid")

        g = sns.boxplot(x="study_day", y = col, hue="group_c", 

                        data = self.df, palette='rainbow', 

                        hue_order=["Group 1", "Group 2", "Group 3"])

        plt.ylabel(y_lab, fontsize=20)

        plt.xlabel("Study Day", fontsize=20)

        plt.tick_params('both', labelsize='14')

        plt.show()



    def sns_bar_plot(self, col, y_lab):       

        plt.subplots(figsize=(12,8))

        sns.set_style("whitegrid")

        g = sns.barplot(x="study_day", y = col, hue="group_c", 

                        data = self.df, palette='rainbow')

        plt.ylabel(y_lab, fontsize=20)

        plt.xlabel("Study Day", fontsize=20)

        plt.tick_params('both', labelsize='14')

        plt.show()



    def sort_df(self, by):

        return self.df.sort_values(by)   



    def summary_stats(self, by, col):

        dfgrouped = self.grouper(self, by)

        df_desc = self.__describe(dfgrouped, col)

        df_summ = pd.DataFrame()

        df_summ["count"] = df_desc['count']

        df_summ["mean"] = df_desc['mean'].round(2).astype(str)

        df_summ["std"] = df_desc['std'].round(2).astype(str)

        df_summ["25%"] = df_desc['25%'].round(2).astype(str)

        df_summ["50%"] = df_desc['50%'].round(2).astype(str)

        df_summ["75%"] = df_desc['75%'].round(2).astype(str)

        df_summ["min"] = df_desc['min'].round(2).astype(str)

        df_summ["max"] = df_desc['max'].round(2).astype(str)

        return self.__sort_by_character_study_day(df_summ)



    def summary_stat_formatted(self, by, col):

        data_summ = self.summary_stats(self, by, col)

        formatted_summ = pd.DataFrame()

        formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

        formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

        formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

        formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

        return formatted_summ



    def write_to_excel(self, outfile, sheetname):

        writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

        self.df.to_excel(writer, sheet_name=sheetname)

I can instantiate an instance of the class as follows:

bodyweight = PreClinicalData(chickv_data['bodyweight'])

And I am able to call the DataFrame and use the head() method just fine.:

bodyweight.df.head()

Now, when I call the same method as shown above...I get the following:

bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")

which yields the following error:

enter image description here

I have no clue where these alleged three arguments are coming from?!

asked Nov 27 '18 at 13:19

TheCuriouslyCodingFoxah

647

I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.

My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!

Thanks in advance everyone!

Below are the functions I have defined:

def __describe(df, col):

    df_desc = df[col].describe()

    return df_desc



def __sort_by_character_study_day(df):        

    new_index_values = df.index.levels[0].str.split().str[-1].astype(int)

    df.index = df.index.set_levels(new_index_values, level='study_day')

    df = df.sort_index()

    return df



def change_from_baseline(df, col):

    df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])

    groupedf = grouper(df=df_sorted, by='Animal_id')

    df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')

    df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100

    return df_sorted



def grouper(df, by):

    df = df.groupby(by)

    return df



def sns_box_plot(df, col, y_lab):   

    plt.subplots(figsize=(12,8))

    sns.set_style("whitegrid")

    g = sns.boxplot(x="study_day", y = col, hue="group_c", 

                    data = df, palette='rainbow', 

                    hue_order=["Group 1", "Group 2", "Group 3"])

    plt.ylabel(y_lab, fontsize=20)

    plt.xlabel("Study Day", fontsize=20)

    plt.tick_params('both', labelsize='14')

    plt.show()



def sns_bar_plot(df, col, y_lab):       

    plt.subplots(figsize=(12,8))

    sns.set_style("whitegrid")

    g = sns.barplot(x="study_day", y = col, hue="group_c", 

                    data = df, palette='rainbow')

    plt.ylabel(y_lab, fontsize=20)

    plt.xlabel("Study Day", fontsize=20)

    plt.tick_params('both', labelsize='14')

    plt.show()



def sort_df(df, by):

    df = df.sort_values(by)

    return df   



def summary_stats(df, by, col):

    dfgrouped = grouper(df, by)

    df_desc = __describe(dfgrouped, col)

    df_summ = pd.DataFrame()

    df_summ["count"] = df_desc['count']

    df_summ["mean"] = df_desc['mean'].round(2).astype(str)

    df_summ["std"] = df_desc['std'].round(2).astype(str)

    df_summ["25%"] = df_desc['25%'].round(2).astype(str)

    df_summ["50%"] = df_desc['50%'].round(2).astype(str)

    df_summ["75%"] = df_desc['75%'].round(2).astype(str)

    df_summ["min"] = df_desc['min'].round(2).astype(str)

    df_summ["max"] = df_desc['max'].round(2).astype(str)

    return __sort_by_character_study_day(df_summ)



def summary_stat_formatted(df, by, col):

    data_summ = summary_stats(df, by, col)

    formatted_summ = pd.DataFrame()

    formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

    formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

    formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

    formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

    return formatted_summ



def write_to_excel(df, outfile, sheetname):

    writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

    df.to_excel(writer, sheet_name=sheetname)

When I call one of the functions, say:

summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')

Everything works as expected, I get the following output:
enter image description here

class PreClinicalData(object):



    def __init__(self, df):

        self.df = df



    def __describe(self, col):

        df_desc = self.df[col].describe()

        return df_desc



    def __sort_by_character_study_day(self):

        new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)

        self.df.index = self.df.index.set_levels(new_index_values, level='study_day')

        self.df = self.df.sort_index()

        return self.df



    def change_from_baseline(self,  col):

        df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])

        groupedf = self.grouper(df=df_sorted, by='Animal_id')

        df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')

        df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100

        return df_sorted



    def summary_stats(self, by, col):

        dfgrouped = self.grouper(df=self.df, by=by)

        df_desc = self.__describe(dfgrouped, col=col)

        df_summ = pd.DataFrame()

        df_summ["count"] = df_desc['count']

        df_summ["mean"] = df_desc['mean'].round(2).astype(str)

        df_summ["std"] = df_desc['std'].round(2).astype(str)

        df_summ["25%"] = df_desc['25%'].round(2).astype(str)

        df_summ["50%"] = df_desc['50%'].round(2).astype(str)

        df_summ["75%"] = df_desc['75%'].round(2).astype(str)

        df_summ["min"] = df_desc['min'].round(2).astype(str)

        df_summ["max"] = df_desc['max'].round(2).astype(str)

        return self.__sort_by_character_study_day(df_summ)



    def summary_stat_formatted(self, by, col):

        data_summ = self.summary_stats(self, by, col)

        formatted_summ = pd.DataFrame()

        formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

        formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

        formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

        formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

        return formatted_summ



    def write_to_excel(self, outfile, sheetname):

        writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

        self.df.to_excel(writer, sheet_name=sheetname)    



    def grouper(self, by):

        return self.df.groupby(by)



    def sns_box_plot(self, col, y_lab):   

        plt.subplots(figsize=(12,8))

        sns.set_style("whitegrid")

        g = sns.boxplot(x="study_day", y = col, hue="group_c", 

                        data = self.df, palette='rainbow', 

                        hue_order=["Group 1", "Group 2", "Group 3"])

        plt.ylabel(y_lab, fontsize=20)

        plt.xlabel("Study Day", fontsize=20)

        plt.tick_params('both', labelsize='14')

        plt.show()



    def sns_bar_plot(self, col, y_lab):       

        plt.subplots(figsize=(12,8))

        sns.set_style("whitegrid")

        g = sns.barplot(x="study_day", y = col, hue="group_c", 

                        data = self.df, palette='rainbow')

        plt.ylabel(y_lab, fontsize=20)

        plt.xlabel("Study Day", fontsize=20)

        plt.tick_params('both', labelsize='14')

        plt.show()



    def sort_df(self, by):

        return self.df.sort_values(by)   



    def summary_stats(self, by, col):

        dfgrouped = self.grouper(self, by)

        df_desc = self.__describe(dfgrouped, col)

        df_summ = pd.DataFrame()

        df_summ["count"] = df_desc['count']

        df_summ["mean"] = df_desc['mean'].round(2).astype(str)

        df_summ["std"] = df_desc['std'].round(2).astype(str)

        df_summ["25%"] = df_desc['25%'].round(2).astype(str)

        df_summ["50%"] = df_desc['50%'].round(2).astype(str)

        df_summ["75%"] = df_desc['75%'].round(2).astype(str)

        df_summ["min"] = df_desc['min'].round(2).astype(str)

        df_summ["max"] = df_desc['max'].round(2).astype(str)

        return self.__sort_by_character_study_day(df_summ)



    def summary_stat_formatted(self, by, col):

        data_summ = self.summary_stats(self, by, col)

        formatted_summ = pd.DataFrame()

        formatted_summ["Number of Observations"] = data_summ['count'].astype(int)

        formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'

        formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  

        formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]

        return formatted_summ



    def write_to_excel(self, outfile, sheetname):

        writer = pd.ExcelWriter(outfile, engine='xlsxwriter')

        self.df.to_excel(writer, sheet_name=sheetname)

I can instantiate an instance of the class as follows:

bodyweight = PreClinicalData(chickv_data['bodyweight'])

And I am able to call the DataFrame and use the head() method just fine.:

bodyweight.df.head()

Now, when I call the same method as shown above...I get the following:

bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")

which yields the following error:

enter image description here

I have no clue where these alleged three arguments are coming from?!

python-3.x pandas oop dataframe pandas-groupby

asked Nov 27 '18 at 13:19

TheCuriouslyCodingFoxah

647

asked Nov 27 '18 at 13:19

TheCuriouslyCodingFoxah

647

asked Nov 27 '18 at 13:19

TheCuriouslyCodingFoxah

647

asked Nov 27 '18 at 13:19

TheCuriouslyCodingFoxah

647

asked Nov 27 '18 at 13:19

TheCuriouslyCodingFoxah

647

1

Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…

– user3471881
Nov 27 '18 at 14:27

2

Don't pass self to grouper: it's just self.grouper(by). if you pass self it will get self(automatically) then self, then by, hence the 3 args.

– progmatico
Nov 27 '18 at 15:48

1

Why are you using double-underscore name-mangling, e.g. __describe? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe method? Or, did you read somewhere that __ is "private" in Python?

– juanpa.arrivillaga
Nov 27 '18 at 21:30

2

So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.

– juanpa.arrivillaga
Nov 27 '18 at 22:19

2

Think about PreclinicalData(df).sns_box_plot('y', 'y-label'). Why is this better than sns_box_plot(df, 'y', 'y-label')?

– user3471881
Nov 28 '18 at 8:43

|
show 8 more comments

1

Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…

– user3471881
Nov 27 '18 at 14:27

2

Don't pass self to grouper: it's just self.grouper(by). if you pass self it will get self(automatically) then self, then by, hence the 3 args.

– progmatico
Nov 27 '18 at 15:48

1

Why are you using double-underscore name-mangling, e.g. __describe? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe method? Or, did you read somewhere that __ is "private" in Python?

– juanpa.arrivillaga
Nov 27 '18 at 21:30

2

So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.

– juanpa.arrivillaga
Nov 27 '18 at 22:19

2

Think about PreclinicalData(df).sns_box_plot('y', 'y-label'). Why is this better than sns_box_plot(df, 'y', 'y-label')?

– user3471881
Nov 28 '18 at 8:43

Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…

– user3471881
Nov 27 '18 at 14:27

Don't pass self to grouper: it's just self.grouper(by). if you pass self it will get self(automatically) then self, then by, hence the 3 args.

– progmatico
Nov 27 '18 at 15:48

Why are you using double-underscore name-mangling, e.g. __describe? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe method? Or, did you read somewhere that __ is "private" in Python?

– juanpa.arrivillaga
Nov 27 '18 at 21:30

So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.

– juanpa.arrivillaga
Nov 27 '18 at 22:19

Think about PreclinicalData(df).sns_box_plot('y', 'y-label'). Why is this better than sns_box_plot(df, 'y', 'y-label')?

– user3471881
Nov 28 '18 at 8:43

|
show 8 more comments

0

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53500653%2fcreating-a-class-of-objects-from-a-group-of-functions%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

0

active

oldest

votes

0

active

oldest

votes

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Btukfyl