Creating a class of objects from a group of functions












1















I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.



I have defined a series of functions, all which act on a pandas DataFrame, and I wish to create a class from this series of functions. I thought it would be a good idea to create a class because I will be using this code on DataFrames that are pretty standard going forward.



My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!



Thanks in advance everyone!



Below are the functions I have defined:



def __describe(df, col):
df_desc = df[col].describe()
return df_desc

def __sort_by_character_study_day(df):
new_index_values = df.index.levels[0].str.split().str[-1].astype(int)
df.index = df.index.set_levels(new_index_values, level='study_day')
df = df.sort_index()
return df

def change_from_baseline(df, col):
df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])
groupedf = grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted

def grouper(df, by):
df = df.groupby(by)
return df

def sns_box_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sns_bar_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sort_df(df, by):
df = df.sort_values(by)
return df

def summary_stats(df, by, col):
dfgrouped = grouper(df, by)
df_desc = __describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return __sort_by_character_study_day(df_summ)

def summary_stat_formatted(df, by, col):
data_summ = summary_stats(df, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(df, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
df.to_excel(writer, sheet_name=sheetname)


When I call one of the functions, say:



summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')


Everything works as expected, I get the following output:
enter image description here



Now, since I will be using these same functions on nearly identical DataFrames (only thing different is the name of the dataframe and the analysis column), I thought it might be a good idea to create a class (please let me know if this is the right approach). My class looks like this:



class PreClinicalData(object):

def __init__(self, df):
self.df = df

def __describe(self, col):
df_desc = self.df[col].describe()
return df_desc

def __sort_by_character_study_day(self):
new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)
self.df.index = self.df.index.set_levels(new_index_values, level='study_day')
self.df = self.df.sort_index()
return self.df

def change_from_baseline(self, col):
df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])
groupedf = self.grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted

def summary_stats(self, by, col):
dfgrouped = self.grouper(df=self.df, by=by)
df_desc = self.__describe(dfgrouped, col=col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)

def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)

def grouper(self, by):
return self.df.groupby(by)

def sns_box_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sns_bar_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sort_df(self, by):
return self.df.sort_values(by)

def summary_stats(self, by, col):
dfgrouped = self.grouper(self, by)
df_desc = self.__describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)

def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)


I can instantiate an instance of the class as follows:



bodyweight = PreClinicalData(chickv_data['bodyweight'])


And I am able to call the DataFrame and use the head() method just fine.:



bodyweight.df.head()


Now, when I call the same method as shown above...I get the following:



bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")


which yields the following error:



enter image description here



I have no clue where these alleged three arguments are coming from?!










share|improve this question


















  • 1





    Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…

    – user3471881
    Nov 27 '18 at 14:27








  • 2





    Don't pass self to grouper: it's just self.grouper(by). if you pass self it will get self(automatically) then self, then by, hence the 3 args.

    – progmatico
    Nov 27 '18 at 15:48






  • 1





    Why are you using double-underscore name-mangling, e.g. __describe? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe method? Or, did you read somewhere that __ is "private" in Python?

    – juanpa.arrivillaga
    Nov 27 '18 at 21:30






  • 2





    So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.

    – juanpa.arrivillaga
    Nov 27 '18 at 22:19








  • 2





    Think about PreclinicalData(df).sns_box_plot('y', 'y-label'). Why is this better than sns_box_plot(df, 'y', 'y-label')?

    – user3471881
    Nov 28 '18 at 8:43
















1















I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.



I have defined a series of functions, all which act on a pandas DataFrame, and I wish to create a class from this series of functions. I thought it would be a good idea to create a class because I will be using this code on DataFrames that are pretty standard going forward.



My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!



Thanks in advance everyone!



Below are the functions I have defined:



def __describe(df, col):
df_desc = df[col].describe()
return df_desc

def __sort_by_character_study_day(df):
new_index_values = df.index.levels[0].str.split().str[-1].astype(int)
df.index = df.index.set_levels(new_index_values, level='study_day')
df = df.sort_index()
return df

def change_from_baseline(df, col):
df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])
groupedf = grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted

def grouper(df, by):
df = df.groupby(by)
return df

def sns_box_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sns_bar_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sort_df(df, by):
df = df.sort_values(by)
return df

def summary_stats(df, by, col):
dfgrouped = grouper(df, by)
df_desc = __describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return __sort_by_character_study_day(df_summ)

def summary_stat_formatted(df, by, col):
data_summ = summary_stats(df, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(df, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
df.to_excel(writer, sheet_name=sheetname)


When I call one of the functions, say:



summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')


Everything works as expected, I get the following output:
enter image description here



Now, since I will be using these same functions on nearly identical DataFrames (only thing different is the name of the dataframe and the analysis column), I thought it might be a good idea to create a class (please let me know if this is the right approach). My class looks like this:



class PreClinicalData(object):

def __init__(self, df):
self.df = df

def __describe(self, col):
df_desc = self.df[col].describe()
return df_desc

def __sort_by_character_study_day(self):
new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)
self.df.index = self.df.index.set_levels(new_index_values, level='study_day')
self.df = self.df.sort_index()
return self.df

def change_from_baseline(self, col):
df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])
groupedf = self.grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted

def summary_stats(self, by, col):
dfgrouped = self.grouper(df=self.df, by=by)
df_desc = self.__describe(dfgrouped, col=col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)

def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)

def grouper(self, by):
return self.df.groupby(by)

def sns_box_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sns_bar_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sort_df(self, by):
return self.df.sort_values(by)

def summary_stats(self, by, col):
dfgrouped = self.grouper(self, by)
df_desc = self.__describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)

def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)


I can instantiate an instance of the class as follows:



bodyweight = PreClinicalData(chickv_data['bodyweight'])


And I am able to call the DataFrame and use the head() method just fine.:



bodyweight.df.head()


Now, when I call the same method as shown above...I get the following:



bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")


which yields the following error:



enter image description here



I have no clue where these alleged three arguments are coming from?!










share|improve this question


















  • 1





    Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…

    – user3471881
    Nov 27 '18 at 14:27








  • 2





    Don't pass self to grouper: it's just self.grouper(by). if you pass self it will get self(automatically) then self, then by, hence the 3 args.

    – progmatico
    Nov 27 '18 at 15:48






  • 1





    Why are you using double-underscore name-mangling, e.g. __describe? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe method? Or, did you read somewhere that __ is "private" in Python?

    – juanpa.arrivillaga
    Nov 27 '18 at 21:30






  • 2





    So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.

    – juanpa.arrivillaga
    Nov 27 '18 at 22:19








  • 2





    Think about PreclinicalData(df).sns_box_plot('y', 'y-label'). Why is this better than sns_box_plot(df, 'y', 'y-label')?

    – user3471881
    Nov 28 '18 at 8:43














1












1








1








I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.



I have defined a series of functions, all which act on a pandas DataFrame, and I wish to create a class from this series of functions. I thought it would be a good idea to create a class because I will be using this code on DataFrames that are pretty standard going forward.



My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!



Thanks in advance everyone!



Below are the functions I have defined:



def __describe(df, col):
df_desc = df[col].describe()
return df_desc

def __sort_by_character_study_day(df):
new_index_values = df.index.levels[0].str.split().str[-1].astype(int)
df.index = df.index.set_levels(new_index_values, level='study_day')
df = df.sort_index()
return df

def change_from_baseline(df, col):
df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])
groupedf = grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted

def grouper(df, by):
df = df.groupby(by)
return df

def sns_box_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sns_bar_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sort_df(df, by):
df = df.sort_values(by)
return df

def summary_stats(df, by, col):
dfgrouped = grouper(df, by)
df_desc = __describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return __sort_by_character_study_day(df_summ)

def summary_stat_formatted(df, by, col):
data_summ = summary_stats(df, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(df, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
df.to_excel(writer, sheet_name=sheetname)


When I call one of the functions, say:



summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')


Everything works as expected, I get the following output:
enter image description here



Now, since I will be using these same functions on nearly identical DataFrames (only thing different is the name of the dataframe and the analysis column), I thought it might be a good idea to create a class (please let me know if this is the right approach). My class looks like this:



class PreClinicalData(object):

def __init__(self, df):
self.df = df

def __describe(self, col):
df_desc = self.df[col].describe()
return df_desc

def __sort_by_character_study_day(self):
new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)
self.df.index = self.df.index.set_levels(new_index_values, level='study_day')
self.df = self.df.sort_index()
return self.df

def change_from_baseline(self, col):
df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])
groupedf = self.grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted

def summary_stats(self, by, col):
dfgrouped = self.grouper(df=self.df, by=by)
df_desc = self.__describe(dfgrouped, col=col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)

def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)

def grouper(self, by):
return self.df.groupby(by)

def sns_box_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sns_bar_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sort_df(self, by):
return self.df.sort_values(by)

def summary_stats(self, by, col):
dfgrouped = self.grouper(self, by)
df_desc = self.__describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)

def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)


I can instantiate an instance of the class as follows:



bodyweight = PreClinicalData(chickv_data['bodyweight'])


And I am able to call the DataFrame and use the head() method just fine.:



bodyweight.df.head()


Now, when I call the same method as shown above...I get the following:



bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")


which yields the following error:



enter image description here



I have no clue where these alleged three arguments are coming from?!










share|improve this question














I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.



I have defined a series of functions, all which act on a pandas DataFrame, and I wish to create a class from this series of functions. I thought it would be a good idea to create a class because I will be using this code on DataFrames that are pretty standard going forward.



My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!



Thanks in advance everyone!



Below are the functions I have defined:



def __describe(df, col):
df_desc = df[col].describe()
return df_desc

def __sort_by_character_study_day(df):
new_index_values = df.index.levels[0].str.split().str[-1].astype(int)
df.index = df.index.set_levels(new_index_values, level='study_day')
df = df.sort_index()
return df

def change_from_baseline(df, col):
df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])
groupedf = grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted

def grouper(df, by):
df = df.groupby(by)
return df

def sns_box_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sns_bar_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sort_df(df, by):
df = df.sort_values(by)
return df

def summary_stats(df, by, col):
dfgrouped = grouper(df, by)
df_desc = __describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return __sort_by_character_study_day(df_summ)

def summary_stat_formatted(df, by, col):
data_summ = summary_stats(df, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(df, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
df.to_excel(writer, sheet_name=sheetname)


When I call one of the functions, say:



summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')


Everything works as expected, I get the following output:
enter image description here



Now, since I will be using these same functions on nearly identical DataFrames (only thing different is the name of the dataframe and the analysis column), I thought it might be a good idea to create a class (please let me know if this is the right approach). My class looks like this:



class PreClinicalData(object):

def __init__(self, df):
self.df = df

def __describe(self, col):
df_desc = self.df[col].describe()
return df_desc

def __sort_by_character_study_day(self):
new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)
self.df.index = self.df.index.set_levels(new_index_values, level='study_day')
self.df = self.df.sort_index()
return self.df

def change_from_baseline(self, col):
df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])
groupedf = self.grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted

def summary_stats(self, by, col):
dfgrouped = self.grouper(df=self.df, by=by)
df_desc = self.__describe(dfgrouped, col=col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)

def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)

def grouper(self, by):
return self.df.groupby(by)

def sns_box_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sns_bar_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()

def sort_df(self, by):
return self.df.sort_values(by)

def summary_stats(self, by, col):
dfgrouped = self.grouper(self, by)
df_desc = self.__describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)

def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ

def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)


I can instantiate an instance of the class as follows:



bodyweight = PreClinicalData(chickv_data['bodyweight'])


And I am able to call the DataFrame and use the head() method just fine.:



bodyweight.df.head()


Now, when I call the same method as shown above...I get the following:



bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")


which yields the following error:



enter image description here



I have no clue where these alleged three arguments are coming from?!







python-3.x pandas oop dataframe pandas-groupby






share|improve this question













share|improve this question











share|improve this question




share|improve this question










asked Nov 27 '18 at 13:19









TheCuriouslyCodingFoxahTheCuriouslyCodingFoxah

647




647








  • 1





    Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…

    – user3471881
    Nov 27 '18 at 14:27








  • 2





    Don't pass self to grouper: it's just self.grouper(by). if you pass self it will get self(automatically) then self, then by, hence the 3 args.

    – progmatico
    Nov 27 '18 at 15:48






  • 1





    Why are you using double-underscore name-mangling, e.g. __describe? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe method? Or, did you read somewhere that __ is "private" in Python?

    – juanpa.arrivillaga
    Nov 27 '18 at 21:30






  • 2





    So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.

    – juanpa.arrivillaga
    Nov 27 '18 at 22:19








  • 2





    Think about PreclinicalData(df).sns_box_plot('y', 'y-label'). Why is this better than sns_box_plot(df, 'y', 'y-label')?

    – user3471881
    Nov 28 '18 at 8:43














  • 1





    Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…

    – user3471881
    Nov 27 '18 at 14:27








  • 2





    Don't pass self to grouper: it's just self.grouper(by). if you pass self it will get self(automatically) then self, then by, hence the 3 args.

    – progmatico
    Nov 27 '18 at 15:48






  • 1





    Why are you using double-underscore name-mangling, e.g. __describe? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe method? Or, did you read somewhere that __ is "private" in Python?

    – juanpa.arrivillaga
    Nov 27 '18 at 21:30






  • 2





    So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.

    – juanpa.arrivillaga
    Nov 27 '18 at 22:19








  • 2





    Think about PreclinicalData(df).sns_box_plot('y', 'y-label'). Why is this better than sns_box_plot(df, 'y', 'y-label')?

    – user3471881
    Nov 28 '18 at 8:43








1




1





Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…

– user3471881
Nov 27 '18 at 14:27







Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…

– user3471881
Nov 27 '18 at 14:27






2




2





Don't pass self to grouper: it's just self.grouper(by). if you pass self it will get self(automatically) then self, then by, hence the 3 args.

– progmatico
Nov 27 '18 at 15:48





Don't pass self to grouper: it's just self.grouper(by). if you pass self it will get self(automatically) then self, then by, hence the 3 args.

– progmatico
Nov 27 '18 at 15:48




1




1





Why are you using double-underscore name-mangling, e.g. __describe? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe method? Or, did you read somewhere that __ is "private" in Python?

– juanpa.arrivillaga
Nov 27 '18 at 21:30





Why are you using double-underscore name-mangling, e.g. __describe? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe method? Or, did you read somewhere that __ is "private" in Python?

– juanpa.arrivillaga
Nov 27 '18 at 21:30




2




2





So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.

– juanpa.arrivillaga
Nov 27 '18 at 22:19







So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.

– juanpa.arrivillaga
Nov 27 '18 at 22:19






2




2





Think about PreclinicalData(df).sns_box_plot('y', 'y-label'). Why is this better than sns_box_plot(df, 'y', 'y-label')?

– user3471881
Nov 28 '18 at 8:43





Think about PreclinicalData(df).sns_box_plot('y', 'y-label'). Why is this better than sns_box_plot(df, 'y', 'y-label')?

– user3471881
Nov 28 '18 at 8:43












0






active

oldest

votes











Your Answer






StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});


}
});














draft saved

draft discarded


















StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53500653%2fcreating-a-class-of-objects-from-a-group-of-functions%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown

























0






active

oldest

votes








0






active

oldest

votes









active

oldest

votes






active

oldest

votes
















draft saved

draft discarded




















































Thanks for contributing an answer to Stack Overflow!


  • Please be sure to answer the question. Provide details and share your research!

But avoid



  • Asking for help, clarification, or responding to other answers.

  • Making statements based on opinion; back them up with references or personal experience.


To learn more, see our tips on writing great answers.




draft saved


draft discarded














StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53500653%2fcreating-a-class-of-objects-from-a-group-of-functions%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown





















































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown

































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown







Popular posts from this blog

A CLEAN and SIMPLE way to add appendices to Table of Contents and bookmarks

Calculate evaluation metrics using cross_val_predict sklearn

Insert data from modal to MySQL (multiple modal on website)