Creating a class of objects from a group of functions
I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.
I have defined a series of functions, all which act on a pandas DataFrame, and I wish to create a class from this series of functions. I thought it would be a good idea to create a class because I will be using this code on DataFrames that are pretty standard going forward.
My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!
Thanks in advance everyone!
Below are the functions I have defined:
def __describe(df, col):
df_desc = df[col].describe()
return df_desc
def __sort_by_character_study_day(df):
new_index_values = df.index.levels[0].str.split().str[-1].astype(int)
df.index = df.index.set_levels(new_index_values, level='study_day')
df = df.sort_index()
return df
def change_from_baseline(df, col):
df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])
groupedf = grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted
def grouper(df, by):
df = df.groupby(by)
return df
def sns_box_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sns_bar_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sort_df(df, by):
df = df.sort_values(by)
return df
def summary_stats(df, by, col):
dfgrouped = grouper(df, by)
df_desc = __describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return __sort_by_character_study_day(df_summ)
def summary_stat_formatted(df, by, col):
data_summ = summary_stats(df, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(df, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
df.to_excel(writer, sheet_name=sheetname)
When I call one of the functions, say:
summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')
Everything works as expected, I get the following output:
Now, since I will be using these same functions on nearly identical DataFrames (only thing different is the name of the dataframe and the analysis column), I thought it might be a good idea to create a class (please let me know if this is the right approach). My class looks like this:
class PreClinicalData(object):
def __init__(self, df):
self.df = df
def __describe(self, col):
df_desc = self.df[col].describe()
return df_desc
def __sort_by_character_study_day(self):
new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)
self.df.index = self.df.index.set_levels(new_index_values, level='study_day')
self.df = self.df.sort_index()
return self.df
def change_from_baseline(self, col):
df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])
groupedf = self.grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted
def summary_stats(self, by, col):
dfgrouped = self.grouper(df=self.df, by=by)
df_desc = self.__describe(dfgrouped, col=col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)
def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)
def grouper(self, by):
return self.df.groupby(by)
def sns_box_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sns_bar_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sort_df(self, by):
return self.df.sort_values(by)
def summary_stats(self, by, col):
dfgrouped = self.grouper(self, by)
df_desc = self.__describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)
def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)
I can instantiate an instance of the class as follows:
bodyweight = PreClinicalData(chickv_data['bodyweight'])
And I am able to call the DataFrame and use the head()
method just fine.:
bodyweight.df.head()
Now, when I call the same method as shown above...I get the following:
bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")
which yields the following error:
I have no clue where these alleged three arguments are coming from?!
python-3.x pandas oop dataframe pandas-groupby
|
show 8 more comments
I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.
I have defined a series of functions, all which act on a pandas DataFrame, and I wish to create a class from this series of functions. I thought it would be a good idea to create a class because I will be using this code on DataFrames that are pretty standard going forward.
My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!
Thanks in advance everyone!
Below are the functions I have defined:
def __describe(df, col):
df_desc = df[col].describe()
return df_desc
def __sort_by_character_study_day(df):
new_index_values = df.index.levels[0].str.split().str[-1].astype(int)
df.index = df.index.set_levels(new_index_values, level='study_day')
df = df.sort_index()
return df
def change_from_baseline(df, col):
df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])
groupedf = grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted
def grouper(df, by):
df = df.groupby(by)
return df
def sns_box_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sns_bar_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sort_df(df, by):
df = df.sort_values(by)
return df
def summary_stats(df, by, col):
dfgrouped = grouper(df, by)
df_desc = __describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return __sort_by_character_study_day(df_summ)
def summary_stat_formatted(df, by, col):
data_summ = summary_stats(df, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(df, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
df.to_excel(writer, sheet_name=sheetname)
When I call one of the functions, say:
summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')
Everything works as expected, I get the following output:
Now, since I will be using these same functions on nearly identical DataFrames (only thing different is the name of the dataframe and the analysis column), I thought it might be a good idea to create a class (please let me know if this is the right approach). My class looks like this:
class PreClinicalData(object):
def __init__(self, df):
self.df = df
def __describe(self, col):
df_desc = self.df[col].describe()
return df_desc
def __sort_by_character_study_day(self):
new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)
self.df.index = self.df.index.set_levels(new_index_values, level='study_day')
self.df = self.df.sort_index()
return self.df
def change_from_baseline(self, col):
df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])
groupedf = self.grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted
def summary_stats(self, by, col):
dfgrouped = self.grouper(df=self.df, by=by)
df_desc = self.__describe(dfgrouped, col=col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)
def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)
def grouper(self, by):
return self.df.groupby(by)
def sns_box_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sns_bar_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sort_df(self, by):
return self.df.sort_values(by)
def summary_stats(self, by, col):
dfgrouped = self.grouper(self, by)
df_desc = self.__describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)
def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)
I can instantiate an instance of the class as follows:
bodyweight = PreClinicalData(chickv_data['bodyweight'])
And I am able to call the DataFrame and use the head()
method just fine.:
bodyweight.df.head()
Now, when I call the same method as shown above...I get the following:
bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")
which yields the following error:
I have no clue where these alleged three arguments are coming from?!
python-3.x pandas oop dataframe pandas-groupby
1
Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…
– user3471881
Nov 27 '18 at 14:27
2
Don't passself
togrouper
: it's justself.grouper(by)
. if you passself
it will getself
(automatically) thenself
, thenby
, hence the 3 args.
– progmatico
Nov 27 '18 at 15:48
1
Why are you using double-underscore name-mangling, e.g.__describe
? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the__describe
method? Or, did you read somewhere that__
is "private" in Python?
– juanpa.arrivillaga
Nov 27 '18 at 21:30
2
So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.
– juanpa.arrivillaga
Nov 27 '18 at 22:19
2
Think aboutPreclinicalData(df).sns_box_plot('y', 'y-label')
. Why is this better thansns_box_plot(df, 'y', 'y-label')
?
– user3471881
Nov 28 '18 at 8:43
|
show 8 more comments
I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.
I have defined a series of functions, all which act on a pandas DataFrame, and I wish to create a class from this series of functions. I thought it would be a good idea to create a class because I will be using this code on DataFrames that are pretty standard going forward.
My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!
Thanks in advance everyone!
Below are the functions I have defined:
def __describe(df, col):
df_desc = df[col].describe()
return df_desc
def __sort_by_character_study_day(df):
new_index_values = df.index.levels[0].str.split().str[-1].astype(int)
df.index = df.index.set_levels(new_index_values, level='study_day')
df = df.sort_index()
return df
def change_from_baseline(df, col):
df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])
groupedf = grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted
def grouper(df, by):
df = df.groupby(by)
return df
def sns_box_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sns_bar_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sort_df(df, by):
df = df.sort_values(by)
return df
def summary_stats(df, by, col):
dfgrouped = grouper(df, by)
df_desc = __describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return __sort_by_character_study_day(df_summ)
def summary_stat_formatted(df, by, col):
data_summ = summary_stats(df, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(df, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
df.to_excel(writer, sheet_name=sheetname)
When I call one of the functions, say:
summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')
Everything works as expected, I get the following output:
Now, since I will be using these same functions on nearly identical DataFrames (only thing different is the name of the dataframe and the analysis column), I thought it might be a good idea to create a class (please let me know if this is the right approach). My class looks like this:
class PreClinicalData(object):
def __init__(self, df):
self.df = df
def __describe(self, col):
df_desc = self.df[col].describe()
return df_desc
def __sort_by_character_study_day(self):
new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)
self.df.index = self.df.index.set_levels(new_index_values, level='study_day')
self.df = self.df.sort_index()
return self.df
def change_from_baseline(self, col):
df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])
groupedf = self.grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted
def summary_stats(self, by, col):
dfgrouped = self.grouper(df=self.df, by=by)
df_desc = self.__describe(dfgrouped, col=col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)
def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)
def grouper(self, by):
return self.df.groupby(by)
def sns_box_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sns_bar_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sort_df(self, by):
return self.df.sort_values(by)
def summary_stats(self, by, col):
dfgrouped = self.grouper(self, by)
df_desc = self.__describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)
def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)
I can instantiate an instance of the class as follows:
bodyweight = PreClinicalData(chickv_data['bodyweight'])
And I am able to call the DataFrame and use the head()
method just fine.:
bodyweight.df.head()
Now, when I call the same method as shown above...I get the following:
bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")
which yields the following error:
I have no clue where these alleged three arguments are coming from?!
python-3.x pandas oop dataframe pandas-groupby
I'm very new to OOP, and want to start getting in the habit of writing modular, reusable code. Sorry for the wall of code and text, just trying to give as much context and be as clear as possible.
I have defined a series of functions, all which act on a pandas DataFrame, and I wish to create a class from this series of functions. I thought it would be a good idea to create a class because I will be using this code on DataFrames that are pretty standard going forward.
My question is twofold:
1. Should I be using OOP concepts and design for this problem
2. What the hell is causing my darn code to break?!
Thanks in advance everyone!
Below are the functions I have defined:
def __describe(df, col):
df_desc = df[col].describe()
return df_desc
def __sort_by_character_study_day(df):
new_index_values = df.index.levels[0].str.split().str[-1].astype(int)
df.index = df.index.set_levels(new_index_values, level='study_day')
df = df.sort_index()
return df
def change_from_baseline(df, col):
df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])
groupedf = grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted
def grouper(df, by):
df = df.groupby(by)
return df
def sns_box_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sns_bar_plot(df, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sort_df(df, by):
df = df.sort_values(by)
return df
def summary_stats(df, by, col):
dfgrouped = grouper(df, by)
df_desc = __describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return __sort_by_character_study_day(df_summ)
def summary_stat_formatted(df, by, col):
data_summ = summary_stats(df, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(df, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
df.to_excel(writer, sheet_name=sheetname)
When I call one of the functions, say:
summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')
Everything works as expected, I get the following output:
Now, since I will be using these same functions on nearly identical DataFrames (only thing different is the name of the dataframe and the analysis column), I thought it might be a good idea to create a class (please let me know if this is the right approach). My class looks like this:
class PreClinicalData(object):
def __init__(self, df):
self.df = df
def __describe(self, col):
df_desc = self.df[col].describe()
return df_desc
def __sort_by_character_study_day(self):
new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)
self.df.index = self.df.index.set_levels(new_index_values, level='study_day')
self.df = self.df.sort_index()
return self.df
def change_from_baseline(self, col):
df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])
groupedf = self.grouper(df=df_sorted, by='Animal_id')
df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
return df_sorted
def summary_stats(self, by, col):
dfgrouped = self.grouper(df=self.df, by=by)
df_desc = self.__describe(dfgrouped, col=col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)
def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)
def grouper(self, by):
return self.df.groupby(by)
def sns_box_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.boxplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow',
hue_order=["Group 1", "Group 2", "Group 3"])
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sns_bar_plot(self, col, y_lab):
plt.subplots(figsize=(12,8))
sns.set_style("whitegrid")
g = sns.barplot(x="study_day", y = col, hue="group_c",
data = self.df, palette='rainbow')
plt.ylabel(y_lab, fontsize=20)
plt.xlabel("Study Day", fontsize=20)
plt.tick_params('both', labelsize='14')
plt.show()
def sort_df(self, by):
return self.df.sort_values(by)
def summary_stats(self, by, col):
dfgrouped = self.grouper(self, by)
df_desc = self.__describe(dfgrouped, col)
df_summ = pd.DataFrame()
df_summ["count"] = df_desc['count']
df_summ["mean"] = df_desc['mean'].round(2).astype(str)
df_summ["std"] = df_desc['std'].round(2).astype(str)
df_summ["25%"] = df_desc['25%'].round(2).astype(str)
df_summ["50%"] = df_desc['50%'].round(2).astype(str)
df_summ["75%"] = df_desc['75%'].round(2).astype(str)
df_summ["min"] = df_desc['min'].round(2).astype(str)
df_summ["max"] = df_desc['max'].round(2).astype(str)
return self.__sort_by_character_study_day(df_summ)
def summary_stat_formatted(self, by, col):
data_summ = self.summary_stats(self, by, col)
formatted_summ = pd.DataFrame()
formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'
formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
return formatted_summ
def write_to_excel(self, outfile, sheetname):
writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
self.df.to_excel(writer, sheet_name=sheetname)
I can instantiate an instance of the class as follows:
bodyweight = PreClinicalData(chickv_data['bodyweight'])
And I am able to call the DataFrame and use the head()
method just fine.:
bodyweight.df.head()
Now, when I call the same method as shown above...I get the following:
bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")
which yields the following error:
I have no clue where these alleged three arguments are coming from?!
python-3.x pandas oop dataframe pandas-groupby
python-3.x pandas oop dataframe pandas-groupby
asked Nov 27 '18 at 13:19
TheCuriouslyCodingFoxahTheCuriouslyCodingFoxah
647
647
1
Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…
– user3471881
Nov 27 '18 at 14:27
2
Don't passself
togrouper
: it's justself.grouper(by)
. if you passself
it will getself
(automatically) thenself
, thenby
, hence the 3 args.
– progmatico
Nov 27 '18 at 15:48
1
Why are you using double-underscore name-mangling, e.g.__describe
? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the__describe
method? Or, did you read somewhere that__
is "private" in Python?
– juanpa.arrivillaga
Nov 27 '18 at 21:30
2
So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.
– juanpa.arrivillaga
Nov 27 '18 at 22:19
2
Think aboutPreclinicalData(df).sns_box_plot('y', 'y-label')
. Why is this better thansns_box_plot(df, 'y', 'y-label')
?
– user3471881
Nov 28 '18 at 8:43
|
show 8 more comments
1
Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…
– user3471881
Nov 27 '18 at 14:27
2
Don't passself
togrouper
: it's justself.grouper(by)
. if you passself
it will getself
(automatically) thenself
, thenby
, hence the 3 args.
– progmatico
Nov 27 '18 at 15:48
1
Why are you using double-underscore name-mangling, e.g.__describe
? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the__describe
method? Or, did you read somewhere that__
is "private" in Python?
– juanpa.arrivillaga
Nov 27 '18 at 21:30
2
So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.
– juanpa.arrivillaga
Nov 27 '18 at 22:19
2
Think aboutPreclinicalData(df).sns_box_plot('y', 'y-label')
. Why is this better thansns_box_plot(df, 'y', 'y-label')
?
– user3471881
Nov 28 '18 at 8:43
1
1
Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…
– user3471881
Nov 27 '18 at 14:27
Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…
– user3471881
Nov 27 '18 at 14:27
2
2
Don't pass
self
to grouper
: it's just self.grouper(by)
. if you pass self
it will get self
(automatically) then self
, then by
, hence the 3 args.– progmatico
Nov 27 '18 at 15:48
Don't pass
self
to grouper
: it's just self.grouper(by)
. if you pass self
it will get self
(automatically) then self
, then by
, hence the 3 args.– progmatico
Nov 27 '18 at 15:48
1
1
Why are you using double-underscore name-mangling, e.g.
__describe
? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe
method? Or, did you read somewhere that __
is "private" in Python?– juanpa.arrivillaga
Nov 27 '18 at 21:30
Why are you using double-underscore name-mangling, e.g.
__describe
? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the __describe
method? Or, did you read somewhere that __
is "private" in Python?– juanpa.arrivillaga
Nov 27 '18 at 21:30
2
2
So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.
– juanpa.arrivillaga
Nov 27 '18 at 22:19
So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.
– juanpa.arrivillaga
Nov 27 '18 at 22:19
2
2
Think about
PreclinicalData(df).sns_box_plot('y', 'y-label')
. Why is this better than sns_box_plot(df, 'y', 'y-label')
?– user3471881
Nov 28 '18 at 8:43
Think about
PreclinicalData(df).sns_box_plot('y', 'y-label')
. Why is this better than sns_box_plot(df, 'y', 'y-label')
?– user3471881
Nov 28 '18 at 8:43
|
show 8 more comments
0
active
oldest
votes
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53500653%2fcreating-a-class-of-objects-from-a-group-of-functions%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
0
active
oldest
votes
0
active
oldest
votes
active
oldest
votes
active
oldest
votes
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53500653%2fcreating-a-class-of-objects-from-a-group-of-functions%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
1
Monkey-patch instead of defining an entire new class that you have to initiate (again) with a df: pandas.pydata.org/pandas-docs/version/0.15/…
– user3471881
Nov 27 '18 at 14:27
2
Don't pass
self
togrouper
: it's justself.grouper(by)
. if you passself
it will getself
(automatically) thenself
, thenby
, hence the 3 args.– progmatico
Nov 27 '18 at 15:48
1
Why are you using double-underscore name-mangling, e.g.
__describe
? Do you want double-underscore name-mangling, i.e., you suspect your class will be subclasses and want to hide the__describe
method? Or, did you read somewhere that__
is "private" in Python?– juanpa.arrivillaga
Nov 27 '18 at 21:30
2
So, very important to understand, python does not have private/public variables. If you want to let other developers know that an attribute is nnot part of the public api, conventionally you use a single underscore. That being said, I am surprised that fixed your issue.
– juanpa.arrivillaga
Nov 27 '18 at 22:19
2
Think about
PreclinicalData(df).sns_box_plot('y', 'y-label')
. Why is this better thansns_box_plot(df, 'y', 'y-label')
?– user3471881
Nov 28 '18 at 8:43