K Mean Clustering from scratch (Python)












-1















 import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import time

start_time = time.time()

style.use('ggplot')

class K_Means:
def __init__(self, k =3, tolerance = 0.0001, max_iterations = 500):
self.k = k
self.tolerance = tolerance
self.max_iterations = max_iterations

def fit(self, data):

self.centroids = {}

#initialize the centroids, the first 'k' elements in the dataset will be our initial centroids
for i in range(self.k):
self.centroids[i] = data[i]

#begin iterations
for i in range(self.max_iterations):
self.classes = {}
for i in range(self.k):
self.classes[i] =

#find the distance between the point and cluster; choose the nearest centroid
for features in data:
distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
self.classes[classification].append(features)

previous = dict(self.centroids)

#average the cluster datapoints to re-calculate the centroids
for classification in self.classes:
self.centroids[classification] = np.average(self.classes[classification], axis = 0)

isOptimal = True

for centroid in self.centroids:

original_centroid = previous[centroid]
curr = self.centroids[centroid]

if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
isOptimal = False

#break out of the main loop if the results are optimal, ie. the centroids don't change their positions much(more than our tolerance)
if isOptimal:
break

def pred(self, data):
distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
classification = distances.index(min(distances))
return classification

def main():


#df = pd.read_csv(r"ipl.csv")
df = pd.read_csv(r"CustomerData4.csv",nrows=200)
#df = df[['one', 'two']]
df=df[['MRank','FRank','RRank']]
dataset = df.astype(float).values.tolist()
X = df.values

#df
dataset = df.astype(float).values.tolist()
X = df.values #returns a numpy array

km = K_Means(5)

km.fit(X)
#y_kmeansP=km.fit(X)

# Plotting starts here
colors = 10*["r", "g", "c", "b", "k"]
#prediction = pd.DataFrame(km.fit(X), columns=['predictions']).to_csv('prediction.csv')

for centroid in km.centroids:
plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1], s = 130, marker = "x")




for classification in km.classes:
color = colors[classification]
for features in km.classes[classification]:
print(classification)
df['Cluster'] = classification
plt.scatter(features[0], features[1], color = color,s = 30)


df.to_csv("clusteringfromscrtach.csv")
#plt.show()
print("--- %s seconds ---" % (time.time() - start_time))

if __name__ == "__main__":
main()


This is the code for K Mean Clustering from scratch
I want to export my data frame with one column added which is cluster I used that line tdf['Cluster'] = classification to add new column named Cluster to my dataframe but it only added one cluster '4'
Other clusters are 0 1 2 3
Any solution to this problem










share|improve this question



























    -1















     import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib import style
    import pandas as pd
    import time

    start_time = time.time()

    style.use('ggplot')

    class K_Means:
    def __init__(self, k =3, tolerance = 0.0001, max_iterations = 500):
    self.k = k
    self.tolerance = tolerance
    self.max_iterations = max_iterations

    def fit(self, data):

    self.centroids = {}

    #initialize the centroids, the first 'k' elements in the dataset will be our initial centroids
    for i in range(self.k):
    self.centroids[i] = data[i]

    #begin iterations
    for i in range(self.max_iterations):
    self.classes = {}
    for i in range(self.k):
    self.classes[i] =

    #find the distance between the point and cluster; choose the nearest centroid
    for features in data:
    distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
    classification = distances.index(min(distances))
    self.classes[classification].append(features)

    previous = dict(self.centroids)

    #average the cluster datapoints to re-calculate the centroids
    for classification in self.classes:
    self.centroids[classification] = np.average(self.classes[classification], axis = 0)

    isOptimal = True

    for centroid in self.centroids:

    original_centroid = previous[centroid]
    curr = self.centroids[centroid]

    if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
    isOptimal = False

    #break out of the main loop if the results are optimal, ie. the centroids don't change their positions much(more than our tolerance)
    if isOptimal:
    break

    def pred(self, data):
    distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
    classification = distances.index(min(distances))
    return classification

    def main():


    #df = pd.read_csv(r"ipl.csv")
    df = pd.read_csv(r"CustomerData4.csv",nrows=200)
    #df = df[['one', 'two']]
    df=df[['MRank','FRank','RRank']]
    dataset = df.astype(float).values.tolist()
    X = df.values

    #df
    dataset = df.astype(float).values.tolist()
    X = df.values #returns a numpy array

    km = K_Means(5)

    km.fit(X)
    #y_kmeansP=km.fit(X)

    # Plotting starts here
    colors = 10*["r", "g", "c", "b", "k"]
    #prediction = pd.DataFrame(km.fit(X), columns=['predictions']).to_csv('prediction.csv')

    for centroid in km.centroids:
    plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1], s = 130, marker = "x")




    for classification in km.classes:
    color = colors[classification]
    for features in km.classes[classification]:
    print(classification)
    df['Cluster'] = classification
    plt.scatter(features[0], features[1], color = color,s = 30)


    df.to_csv("clusteringfromscrtach.csv")
    #plt.show()
    print("--- %s seconds ---" % (time.time() - start_time))

    if __name__ == "__main__":
    main()


    This is the code for K Mean Clustering from scratch
    I want to export my data frame with one column added which is cluster I used that line tdf['Cluster'] = classification to add new column named Cluster to my dataframe but it only added one cluster '4'
    Other clusters are 0 1 2 3
    Any solution to this problem










    share|improve this question

























      -1












      -1








      -1








       import numpy as np
      import matplotlib.pyplot as plt
      from matplotlib import style
      import pandas as pd
      import time

      start_time = time.time()

      style.use('ggplot')

      class K_Means:
      def __init__(self, k =3, tolerance = 0.0001, max_iterations = 500):
      self.k = k
      self.tolerance = tolerance
      self.max_iterations = max_iterations

      def fit(self, data):

      self.centroids = {}

      #initialize the centroids, the first 'k' elements in the dataset will be our initial centroids
      for i in range(self.k):
      self.centroids[i] = data[i]

      #begin iterations
      for i in range(self.max_iterations):
      self.classes = {}
      for i in range(self.k):
      self.classes[i] =

      #find the distance between the point and cluster; choose the nearest centroid
      for features in data:
      distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
      classification = distances.index(min(distances))
      self.classes[classification].append(features)

      previous = dict(self.centroids)

      #average the cluster datapoints to re-calculate the centroids
      for classification in self.classes:
      self.centroids[classification] = np.average(self.classes[classification], axis = 0)

      isOptimal = True

      for centroid in self.centroids:

      original_centroid = previous[centroid]
      curr = self.centroids[centroid]

      if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
      isOptimal = False

      #break out of the main loop if the results are optimal, ie. the centroids don't change their positions much(more than our tolerance)
      if isOptimal:
      break

      def pred(self, data):
      distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
      classification = distances.index(min(distances))
      return classification

      def main():


      #df = pd.read_csv(r"ipl.csv")
      df = pd.read_csv(r"CustomerData4.csv",nrows=200)
      #df = df[['one', 'two']]
      df=df[['MRank','FRank','RRank']]
      dataset = df.astype(float).values.tolist()
      X = df.values

      #df
      dataset = df.astype(float).values.tolist()
      X = df.values #returns a numpy array

      km = K_Means(5)

      km.fit(X)
      #y_kmeansP=km.fit(X)

      # Plotting starts here
      colors = 10*["r", "g", "c", "b", "k"]
      #prediction = pd.DataFrame(km.fit(X), columns=['predictions']).to_csv('prediction.csv')

      for centroid in km.centroids:
      plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1], s = 130, marker = "x")




      for classification in km.classes:
      color = colors[classification]
      for features in km.classes[classification]:
      print(classification)
      df['Cluster'] = classification
      plt.scatter(features[0], features[1], color = color,s = 30)


      df.to_csv("clusteringfromscrtach.csv")
      #plt.show()
      print("--- %s seconds ---" % (time.time() - start_time))

      if __name__ == "__main__":
      main()


      This is the code for K Mean Clustering from scratch
      I want to export my data frame with one column added which is cluster I used that line tdf['Cluster'] = classification to add new column named Cluster to my dataframe but it only added one cluster '4'
      Other clusters are 0 1 2 3
      Any solution to this problem










      share|improve this question














       import numpy as np
      import matplotlib.pyplot as plt
      from matplotlib import style
      import pandas as pd
      import time

      start_time = time.time()

      style.use('ggplot')

      class K_Means:
      def __init__(self, k =3, tolerance = 0.0001, max_iterations = 500):
      self.k = k
      self.tolerance = tolerance
      self.max_iterations = max_iterations

      def fit(self, data):

      self.centroids = {}

      #initialize the centroids, the first 'k' elements in the dataset will be our initial centroids
      for i in range(self.k):
      self.centroids[i] = data[i]

      #begin iterations
      for i in range(self.max_iterations):
      self.classes = {}
      for i in range(self.k):
      self.classes[i] =

      #find the distance between the point and cluster; choose the nearest centroid
      for features in data:
      distances = [np.linalg.norm(features - self.centroids[centroid]) for centroid in self.centroids]
      classification = distances.index(min(distances))
      self.classes[classification].append(features)

      previous = dict(self.centroids)

      #average the cluster datapoints to re-calculate the centroids
      for classification in self.classes:
      self.centroids[classification] = np.average(self.classes[classification], axis = 0)

      isOptimal = True

      for centroid in self.centroids:

      original_centroid = previous[centroid]
      curr = self.centroids[centroid]

      if np.sum((curr - original_centroid)/original_centroid * 100.0) > self.tolerance:
      isOptimal = False

      #break out of the main loop if the results are optimal, ie. the centroids don't change their positions much(more than our tolerance)
      if isOptimal:
      break

      def pred(self, data):
      distances = [np.linalg.norm(data - self.centroids[centroid]) for centroid in self.centroids]
      classification = distances.index(min(distances))
      return classification

      def main():


      #df = pd.read_csv(r"ipl.csv")
      df = pd.read_csv(r"CustomerData4.csv",nrows=200)
      #df = df[['one', 'two']]
      df=df[['MRank','FRank','RRank']]
      dataset = df.astype(float).values.tolist()
      X = df.values

      #df
      dataset = df.astype(float).values.tolist()
      X = df.values #returns a numpy array

      km = K_Means(5)

      km.fit(X)
      #y_kmeansP=km.fit(X)

      # Plotting starts here
      colors = 10*["r", "g", "c", "b", "k"]
      #prediction = pd.DataFrame(km.fit(X), columns=['predictions']).to_csv('prediction.csv')

      for centroid in km.centroids:
      plt.scatter(km.centroids[centroid][0], km.centroids[centroid][1], s = 130, marker = "x")




      for classification in km.classes:
      color = colors[classification]
      for features in km.classes[classification]:
      print(classification)
      df['Cluster'] = classification
      plt.scatter(features[0], features[1], color = color,s = 30)


      df.to_csv("clusteringfromscrtach.csv")
      #plt.show()
      print("--- %s seconds ---" % (time.time() - start_time))

      if __name__ == "__main__":
      main()


      This is the code for K Mean Clustering from scratch
      I want to export my data frame with one column added which is cluster I used that line tdf['Cluster'] = classification to add new column named Cluster to my dataframe but it only added one cluster '4'
      Other clusters are 0 1 2 3
      Any solution to this problem







      python cluster-analysis






      share|improve this question













      share|improve this question











      share|improve this question




      share|improve this question










      asked Nov 24 '18 at 4:50









      Usman RafiqUsman Rafiq

      45




      45
























          1 Answer
          1






          active

          oldest

          votes


















          0














          df['Cluster'] = classification


          Obviously you are overwriting this column k times.



          Instead, combine the results into one column.



          Benchmark your code on larger data, too...






          share|improve this answer























            Your Answer






            StackExchange.ifUsing("editor", function () {
            StackExchange.using("externalEditor", function () {
            StackExchange.using("snippets", function () {
            StackExchange.snippets.init();
            });
            });
            }, "code-snippets");

            StackExchange.ready(function() {
            var channelOptions = {
            tags: "".split(" "),
            id: "1"
            };
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function() {
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
            createEditor();
            });
            }
            else {
            createEditor();
            }
            });

            function createEditor() {
            StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
            brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
            contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
            allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            });


            }
            });














            draft saved

            draft discarded


















            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53455229%2fk-mean-clustering-from-scratch-python%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown

























            1 Answer
            1






            active

            oldest

            votes








            1 Answer
            1






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes









            0














            df['Cluster'] = classification


            Obviously you are overwriting this column k times.



            Instead, combine the results into one column.



            Benchmark your code on larger data, too...






            share|improve this answer




























              0














              df['Cluster'] = classification


              Obviously you are overwriting this column k times.



              Instead, combine the results into one column.



              Benchmark your code on larger data, too...






              share|improve this answer


























                0












                0








                0







                df['Cluster'] = classification


                Obviously you are overwriting this column k times.



                Instead, combine the results into one column.



                Benchmark your code on larger data, too...






                share|improve this answer













                df['Cluster'] = classification


                Obviously you are overwriting this column k times.



                Instead, combine the results into one column.



                Benchmark your code on larger data, too...







                share|improve this answer












                share|improve this answer



                share|improve this answer










                answered Nov 24 '18 at 19:15









                Anony-MousseAnony-Mousse

                57.5k796159




                57.5k796159






























                    draft saved

                    draft discarded




















































                    Thanks for contributing an answer to Stack Overflow!


                    • Please be sure to answer the question. Provide details and share your research!

                    But avoid



                    • Asking for help, clarification, or responding to other answers.

                    • Making statements based on opinion; back them up with references or personal experience.


                    To learn more, see our tips on writing great answers.




                    draft saved


                    draft discarded














                    StackExchange.ready(
                    function () {
                    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53455229%2fk-mean-clustering-from-scratch-python%23new-answer', 'question_page');
                    }
                    );

                    Post as a guest















                    Required, but never shown





















































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown

































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown







                    Popular posts from this blog

                    A CLEAN and SIMPLE way to add appendices to Table of Contents and bookmarks

                    Calculate evaluation metrics using cross_val_predict sklearn

                    Insert data from modal to MySQL (multiple modal on website)