柚子快報邀請碼778899分享:聚類 機器學習 深度學習 K
柚子快報邀請碼778899分享:聚類 機器學習 深度學習 K
主要用于保存記錄,來源B站視頻跟著大佬做的。
實(shí)現(xiàn)效果:
1.原始數據分類  2.原始數據未分類
1.原始數據分類  2.數據聚類后(‘x’是每個聚類的中心)
demo:
### demo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from k_means import KMeans
# Demo script: cluster the iris petal measurements with K-means and plot the
# clusters next to the true species labels.

# Load the data.
data = pd.read_csv("./iris.csv")
iris_types = ['setosa', 'versicolor', 'virginica']  # species names
x_axis = 'Petal_Length'  # column drawn on the x axis
y_axis = 'Petal_Width'  # column drawn on the y axis


def _scatter_by_species():
    """Scatter the two petal columns on the current axes, one series per species."""
    for iris_type in iris_types:
        # Build the row mask once and reuse it for both columns.
        is_type = data['Species'] == iris_type
        plt.scatter(data[x_axis][is_type], data[y_axis][is_type], label=iris_type)
    plt.title('label know')
    plt.legend()


# Figure 1: labelled points (left) versus the same points without labels (right).
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
_scatter_by_species()
plt.subplot(122)
plt.scatter(data[x_axis][:], data[y_axis][:])
plt.title('label unknow')
plt.show()

num_examples = data.shape[0]  # number of samples
# Training matrix: one row per sample, columns are the two plotted features.
x_train = data[[x_axis, y_axis]].values.reshape(num_examples, 2)

# Training parameters.
num_clusters = 3
max_iterition = 50

# Keep the instance under its own name so the imported KMeans class is not
# overwritten and stays usable afterwards.
model = KMeans(x_train, num_clusters)
centroids, closest_centroids_ids = model.train(max_iterition)

# Figure 2: true labels (left) versus the clusters found by K-means (right).
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
_scatter_by_species()
plt.subplot(1, 2, 2)
for centroids_id, _ in enumerate(centroids):
    # Boolean row mask of the samples assigned to this cluster.
    current_examples_index = (closest_centroids_ids == centroids_id).flatten()
    plt.scatter(data[x_axis][current_examples_index], data[y_axis][current_examples_index], label=centroids_id)
for centroid in centroids:
    # Mark every cluster centre with a black 'x'.
    plt.scatter(centroid[0], centroid[1], c='black', marker='x')
plt.title('label KMeans')
plt.legend()
plt.show()
k_means.py:
### K_means
import numpy as np
# closest_centroids_ids holds, for every sample, the index (0, 1, ..., K-1) of the cluster centre nearest to it.
class KMeans:
    """Minimal K-means clustering over a 2-D array of samples.

    Throughout the class, ``closest_centroids_ids`` is an
    ``(num_examples, 1)`` float array whose entry for a sample is the
    zero-based index of the centroid nearest to that sample.
    """

    def __init__(self, data, num_clustres):
        """Store the training samples and the number of clusters (K).

        Args:
            data: array-like of shape ``(num_examples, num_features)``.
            num_clustres: number of clusters to fit.
        """
        self.data = data
        self.num_clustres = num_clustres

    def train(self, max_iterations):
        """Run K-means for at most ``max_iterations`` centroid updates.

        Args:
            max_iterations: upper bound on the number of update steps.

        Returns:
            ``(centroids, closest_centroids_ids)``; the assignments are always
            computed against the returned centroids.
        """
        data = np.asarray(self.data)
        # 1. Pick K random samples as the initial centroids.
        centroids = KMeans.centroids_init(data, self.num_clustres)
        # 2. Assign up front so the result is well defined even when
        #    max_iterations is 0.
        closest_centroids_ids = KMeans.centroids_find_closest(data, centroids)
        for _ in range(max_iterations):
            # 3. Move every centroid to the mean of its members; an empty
            #    cluster keeps its previous position instead of becoming NaN.
            centroids = KMeans.centroids_computer(
                data, closest_centroids_ids, self.num_clustres, centroids
            )
            # 4. Re-assign every sample to its nearest centroid.
            new_ids = KMeans.centroids_find_closest(data, centroids)
            if np.array_equal(new_ids, closest_centroids_ids):
                # Unchanged assignments would reproduce the same centroids, so
                # further iterations cannot alter the result.
                break
            closest_centroids_ids = new_ids
        return centroids, closest_centroids_ids

    @staticmethod
    def centroids_init(data, num_clustres):
        """Pick ``num_clustres`` distinct rows of ``data`` at random as centroids.

        Raises:
            ValueError: if ``num_clustres`` is not between 1 and the number of
                samples.
        """
        data = np.asarray(data)
        num_examples = data.shape[0]
        if not 1 <= num_clustres <= num_examples:
            raise ValueError(
                "num_clustres must be between 1 and the number of samples"
            )
        # Shuffle the row indices and keep the first K of them.
        random_ids = np.random.permutation(num_examples)
        return data[random_ids[:num_clustres], :]

    @staticmethod
    def centroids_find_closest(data, centroids):
        """Return the index of the nearest centroid for every sample.

        Squared Euclidean distance over all features is used; ties go to the
        lowest centroid index.

        Returns:
            Float array of shape ``(num_examples, 1)``.
        """
        data = np.asarray(data, dtype=float)
        centroids = np.asarray(centroids, dtype=float)
        # Broadcast (n, 1, f) against (1, k, f) to get every sample/centroid
        # difference at once, using all features rather than only the first 4.
        diff = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        squared_distance = np.sum(diff ** 2, axis=2)
        # Kept as a float column vector for compatibility with existing callers.
        return np.argmin(squared_distance, axis=1).astype(float).reshape(-1, 1)

    @staticmethod
    def centroids_computer(data, closest_centroids_ids, num_clustres, previous_centroids=None):
        """Recompute each centroid as the per-feature mean of its members.

        Args:
            data: array-like of shape ``(num_examples, num_features)``.
            closest_centroids_ids: cluster index of every sample.
            num_clustres: number of clusters.
            previous_centroids: optional centroids from the previous step. A
                cluster without members keeps its row from here; when omitted,
                such a cluster's centroid is NaN.

        Returns:
            Float array of shape ``(num_clustres, num_features)``.
        """
        data = np.asarray(data)
        labels = np.asarray(closest_centroids_ids).flatten()
        centroids = np.full((num_clustres, data.shape[1]), np.nan)
        for centroids_id in range(num_clustres):
            members = data[labels == centroids_id, :]
            if members.shape[0] > 0:
                centroids[centroids_id] = np.mean(members, axis=0)
            elif previous_centroids is not None and centroids_id < len(previous_centroids):
                # Empty cluster: averaging nothing would give NaN and poison
                # every later argmin, so stay at the previous position.
                centroids[centroids_id] = previous_centroids[centroids_id]
        return centroids
柚子快報邀請碼778899分享:聚類 機器學習 深度學習 K
參考閱讀
本文內容根據網絡資料整理,出于傳遞更多信息之目的,不代表金鑰匙跨境贊同其觀點和立場。
轉(zhuǎn)載請注明,如有侵權(quán),聯(lián)系刪除。