K Mean Clustering from scratch (Python)
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import time
start_time = time.time()
style.use('ggplot')
class K_Means:
def __init__(self, k =3, tolerance = 0.0001, max_iterations = 500):
self.k = k
self.tolerance = tolerance
self.max_iterations = max_iterations
def fit(self, data):
self.centroids = {}
#initialize the centroids, the first 'k' elements in the dataset will be our initial centroids
for i in range(self.k):
self.centroids[i] = data[i]
#begin iterations
for i in range(self.max_iterations):
self.classes = {}
for i in range(self.k):
self.classes[i] =
#find the distance between the point and cluster; choose the nearest centroid
for features in data:
distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
self.classes[classification].append(features)
previous = dict(self.centroids)
#average the cluster datapoints to re-calculate the centroids
for classification in self.classes:
self.centroids[classification] = np.average(self.classes[classification], axis = 0)
isOptimal = True
for centroid in self.centroids:
original_centroid = previous[centroid]
curr = self.centroids[centroid]
if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
isOptimal = False
#break out of the main loop if the results are optimal, ie. the centroids don't change their positions much(more than our tolerance)
if isOptimal:
break
def pred(self, data):
distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
return classification
def main():
#df = pd.read_csv(r"ipl.csv")
df = pd.read_csv(r"CustomerData4.csv",nrows=200)
#df = df[['one', 'two']]
df=df[['MRank','FRank','RRank']]
dataset = df.astype(float).values.tolist()
X = df.values
#df
dataset = df.astype(float).values.tolist()
X = df.values #returns a numpy array
km = K_Means(5)
km.fit(X)
#y_kmeansP=km.fit(X)
# Plotting starts here
colors = 10*["r", "g", "c", "b", "k"]
#prediction = pd.DataFrame(km.fit(X), columns=['predictions']).to_csv('prediction.csv')
for centroid in km.centroids:
plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1], s = 130, marker = "x")
for classification in km.classes:
color = colors[classification]
for features in km.classes[classification]:
print(classification)
df['Cluster'] = classification
plt.scatter(features[0], features[1], color = color,s = 30)
df.to_csv("clusteringfromscrtach.csv")
#plt.show()
print("--- %s seconds ---" % (time.time() - start_time))
if __name__ == "__main__":
main()
This is the code for K Mean Clustering from scratch
I want to export my data frame with one column added which is cluster I used that line tdf['Cluster'] = classification to add new column named Cluster to my dataframe but it only added one cluster '4'
Other clusters are 0 1 2 3
Any solution to this problem
python cluster-analysis
add a comment |
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import time
start_time = time.time()
style.use('ggplot')
class K_Means:
def __init__(self, k =3, tolerance = 0.0001, max_iterations = 500):
self.k = k
self.tolerance = tolerance
self.max_iterations = max_iterations
def fit(self, data):
self.centroids = {}
#initialize the centroids, the first 'k' elements in the dataset will be our initial centroids
for i in range(self.k):
self.centroids[i] = data[i]
#begin iterations
for i in range(self.max_iterations):
self.classes = {}
for i in range(self.k):
self.classes[i] =
#find the distance between the point and cluster; choose the nearest centroid
for features in data:
distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
self.classes[classification].append(features)
previous = dict(self.centroids)
#average the cluster datapoints to re-calculate the centroids
for classification in self.classes:
self.centroids[classification] = np.average(self.classes[classification], axis = 0)
isOptimal = True
for centroid in self.centroids:
original_centroid = previous[centroid]
curr = self.centroids[centroid]
if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
isOptimal = False
#break out of the main loop if the results are optimal, ie. the centroids don't change their positions much(more than our tolerance)
if isOptimal:
break
def pred(self, data):
distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
return classification
def main():
#df = pd.read_csv(r"ipl.csv")
df = pd.read_csv(r"CustomerData4.csv",nrows=200)
#df = df[['one', 'two']]
df=df[['MRank','FRank','RRank']]
dataset = df.astype(float).values.tolist()
X = df.values
#df
dataset = df.astype(float).values.tolist()
X = df.values #returns a numpy array
km = K_Means(5)
km.fit(X)
#y_kmeansP=km.fit(X)
# Plotting starts here
colors = 10*["r", "g", "c", "b", "k"]
#prediction = pd.DataFrame(km.fit(X), columns=['predictions']).to_csv('prediction.csv')
for centroid in km.centroids:
plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1], s = 130, marker = "x")
for classification in km.classes:
color = colors[classification]
for features in km.classes[classification]:
print(classification)
df['Cluster'] = classification
plt.scatter(features[0], features[1], color = color,s = 30)
df.to_csv("clusteringfromscrtach.csv")
#plt.show()
print("--- %s seconds ---" % (time.time() - start_time))
if __name__ == "__main__":
main()
This is the code for K Mean Clustering from scratch
I want to export my data frame with one column added which is cluster I used that line tdf['Cluster'] = classification to add new column named Cluster to my dataframe but it only added one cluster '4'
Other clusters are 0 1 2 3
Any solution to this problem
python cluster-analysis
add a comment |
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import time
start_time = time.time()
style.use('ggplot')
class K_Means:
def __init__(self, k =3, tolerance = 0.0001, max_iterations = 500):
self.k = k
self.tolerance = tolerance
self.max_iterations = max_iterations
def fit(self, data):
self.centroids = {}
#initialize the centroids, the first 'k' elements in the dataset will be our initial centroids
for i in range(self.k):
self.centroids[i] = data[i]
#begin iterations
for i in range(self.max_iterations):
self.classes = {}
for i in range(self.k):
self.classes[i] =
#find the distance between the point and cluster; choose the nearest centroid
for features in data:
distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
self.classes[classification].append(features)
previous = dict(self.centroids)
#average the cluster datapoints to re-calculate the centroids
for classification in self.classes:
self.centroids[classification] = np.average(self.classes[classification], axis = 0)
isOptimal = True
for centroid in self.centroids:
original_centroid = previous[centroid]
curr = self.centroids[centroid]
if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
isOptimal = False
#break out of the main loop if the results are optimal, ie. the centroids don't change their positions much(more than our tolerance)
if isOptimal:
break
def pred(self, data):
distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
return classification
def main():
#df = pd.read_csv(r"ipl.csv")
df = pd.read_csv(r"CustomerData4.csv",nrows=200)
#df = df[['one', 'two']]
df=df[['MRank','FRank','RRank']]
dataset = df.astype(float).values.tolist()
X = df.values
#df
dataset = df.astype(float).values.tolist()
X = df.values #returns a numpy array
km = K_Means(5)
km.fit(X)
#y_kmeansP=km.fit(X)
# Plotting starts here
colors = 10*["r", "g", "c", "b", "k"]
#prediction = pd.DataFrame(km.fit(X), columns=['predictions']).to_csv('prediction.csv')
for centroid in km.centroids:
plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1], s = 130, marker = "x")
for classification in km.classes:
color = colors[classification]
for features in km.classes[classification]:
print(classification)
df['Cluster'] = classification
plt.scatter(features[0], features[1], color = color,s = 30)
df.to_csv("clusteringfromscrtach.csv")
#plt.show()
print("--- %s seconds ---" % (time.time() - start_time))
if __name__ == "__main__":
main()
This is the code for K Mean Clustering from scratch
I want to export my data frame with one column added which is cluster I used that line tdf['Cluster'] = classification to add new column named Cluster to my dataframe but it only added one cluster '4'
Other clusters are 0 1 2 3
Any solution to this problem
python cluster-analysis
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import time
start_time = time.time()
style.use('ggplot')
class K_Means:
def __init__(self, k =3, tolerance = 0.0001, max_iterations = 500):
self.k = k
self.tolerance = tolerance
self.max_iterations = max_iterations
def fit(self, data):
self.centroids = {}
#initialize the centroids, the first 'k' elements in the dataset will be our initial centroids
for i in range(self.k):
self.centroids[i] = data[i]
#begin iterations
for i in range(self.max_iterations):
self.classes = {}
for i in range(self.k):
self.classes[i] =
#find the distance between the point and cluster; choose the nearest centroid
for features in data:
distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
self.classes[classification].append(features)
previous = dict(self.centroids)
#average the cluster datapoints to re-calculate the centroids
for classification in self.classes:
self.centroids[classification] = np.average(self.classes[classification], axis = 0)
isOptimal = True
for centroid in self.centroids:
original_centroid = previous[centroid]
curr = self.centroids[centroid]
if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
isOptimal = False
#break out of the main loop if the results are optimal, ie. the centroids don't change their positions much(more than our tolerance)
if isOptimal:
break
def pred(self, data):
distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
return classification
def main():
#df = pd.read_csv(r"ipl.csv")
df = pd.read_csv(r"CustomerData4.csv",nrows=200)
#df = df[['one', 'two']]
df=df[['MRank','FRank','RRank']]
dataset = df.astype(float).values.tolist()
X = df.values
#df
dataset = df.astype(float).values.tolist()
X = df.values #returns a numpy array
km = K_Means(5)
km.fit(X)
#y_kmeansP=km.fit(X)
# Plotting starts here
colors = 10*["r", "g", "c", "b", "k"]
#prediction = pd.DataFrame(km.fit(X), columns=['predictions']).to_csv('prediction.csv')
for centroid in km.centroids:
plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1], s = 130, marker = "x")
for classification in km.classes:
color = colors[classification]
for features in km.classes[classification]:
print(classification)
df['Cluster'] = classification
plt.scatter(features[0], features[1], color = color,s = 30)
df.to_csv("clusteringfromscrtach.csv")
#plt.show()
print("--- %s seconds ---" % (time.time() - start_time))
if __name__ == "__main__":
main()
This is the code for K Mean Clustering from scratch
I want to export my data frame with one column added which is cluster I used that line tdf['Cluster'] = classification to add new column named Cluster to my dataframe but it only added one cluster '4'
Other clusters are 0 1 2 3
Any solution to this problem
python cluster-analysis
python cluster-analysis
asked Nov 24 '18 at 4:50
Usman RafiqUsman Rafiq
45
45
add a comment |
add a comment |
1 Answer
1
active
oldest
votes
df['Cluster'] = classification
Obviously you are overwriting this column k times.
Instead, combine the results into one column.
Benchmark your code on larger data, too...
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53455229%2fk-mean-clustering-from-scratch-python%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
df['Cluster'] = classification
Obviously you are overwriting this column k times.
Instead, combine the results into one column.
Benchmark your code on larger data, too...
add a comment |
df['Cluster'] = classification
Obviously you are overwriting this column k times.
Instead, combine the results into one column.
Benchmark your code on larger data, too...
add a comment |
df['Cluster'] = classification
Obviously you are overwriting this column k times.
Instead, combine the results into one column.
Benchmark your code on larger data, too...
df['Cluster'] = classification
Obviously you are overwriting this column k times.
Instead, combine the results into one column.
Benchmark your code on larger data, too...
answered Nov 24 '18 at 19:15
Anony-MousseAnony-Mousse
57.5k796159
57.5k796159
add a comment |
add a comment |
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53455229%2fk-mean-clustering-from-scratch-python%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown