import numpy as np

def InitialCentroid(x, K):
    """Pick K initial centers with the farthest-point heuristic,
    a deterministic variant of k-means++ seeding."""
    c1_idx = int(np.random.uniform(0, len(x)))  # draw the first index uniformly at random
    centroid = x[c1_idx].reshape(1, -1)  # choose the first cluster center
    k = 1
    n = x.shape[0]  # number of samples
    while k < K:
        d2 = []
        for i in range(n):
            subs = centroid - x[i, :]  # D(x) = (x_1, y_1) - (x, y)
            dimension2 = np.power(subs, 2)  # D(x)^2
            dimension_s = np.sum(dimension2, axis=1)  # squared distance to each chosen center
            d2.append(np.min(dimension_s))  # distance to the nearest chosen center
        new_c_idx = np.argmax(d2)  # the point farthest from all chosen centers
        centroid = np.vstack([centroid, x[new_c_idx]])
        k += 1
    return centroid
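A minimal usage sketch for the function above; the toy array is illustrative only:

x = np.array([[0.0, 0.0], [0.0, 1.0],
              [5.0, 5.0], [5.0, 6.0],
              [10.0, 0.0], [10.0, 1.0]])  # three well-separated pairs
centers = InitialCentroid(x, K=3)
print(centers.shape)  # (3, 2): one center drawn from each separated pair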
from yellowbrick.features import radviz
from yellowbrick.datasets import load_occupancy
X, y = load_occupancy()
visualizer = radviz(X, y, colors=["maroon", "gold"])
from yellowbrick.datasets import load_game
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from yellowbrick.classifier import class_prediction_error
X, y = load_game()
X = OneHotEncoder().fit_transform(X)
visualizer = class_prediction_error(
RandomForestClassifier(n_estimators=10), X, y
)
from yellowbrick.datasets import load_credit
from sklearn.ensemble import RandomForestClassifier
from yellowbrick.classifier import classification_report
X, y = load_credit()
visualizer = classification_report(
RandomForestClassifier(n_estimators=10), X, y
)
Confusion Matrix
The ConfusionMatrix visualizer is a ScoreVisualizer that takes a fitted scikit-learn classifier and a set of test X and y values, then returns a report showing how each test value's predicted class compares to its actual class.
Data scientists use confusion matrices to understand which classes are most easily confused.
from yellowbrick.datasets import load_game
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import RidgeClassifier
from yellowbrick.classifier import confusion_matrix
X, y = load_game()
X = OneHotEncoder().fit_transform(X)
visualizer = confusion_matrix(RidgeClassifier(), X, y, cmap="Greens")
from yellowbrick.classifier import discrimination_threshold
from sklearn.linear_model import LogisticRegression
from yellowbrick.datasets import load_spam
X, y = load_spam()
visualizer = discrimination_threshold(
LogisticRegression(multi_class="auto", solver="liblinear"), X, y
)
from sklearn.linear_model import Ridge
from yellowbrick.datasets import load_concrete
from yellowbrick.regressor import residuals_plot
X, y = load_concrete()
visualizer = residuals_plot(
Ridge(), X, y, train_color="maroon", test_color="gold"
)
Prediction Error
A prediction error plot shows the actual targets from the dataset against the predicted values generated by our model, which lets us see how much variance there is in the model.
Data scientists can use this plot to diagnose regression models by comparing them against the 45-degree line, where the prediction exactly matches the actual value.
from sklearn.linear_model import Lasso
from yellowbrick.datasets import load_bikeshare
from yellowbrick.regressor import prediction_error
X, y = load_bikeshare()
visualizer = prediction_error(Lasso(), X, y)
from yellowbrick.datasets import load_game
from yellowbrick.target import class_balance
X, y = load_game()
visualizer = class_balance(y, labels=["draw", "loss", "win"])
R code example
library(magrittr)
library(tinyfuncr)

file <- commandArgs(TRUE)[[1]]
maxK <- 50L
filebase <- sub("\\.xls$", "", file)  # strip the .xls extension (escape the dot, anchor at end)

# read file
data <- read_tcsv(file)
data_mat <- data[6:ncol(data)]
rownames(data_mat) <- data[[4]]

######## Run PCA ########
pca <- prcomp(data_mat, center = TRUE, scale. = TRUE)
cat(paste0(filebase, "\n"))
summary(pca)

# select PC num: the first PC at which the cumulative proportion of variance exceeds 0.9
pc_num <- which(summary(pca)$importance[3, ] > 0.9)[[1]]
data_pc <- as.data.frame(pca$x)[1:pc_num]
cat(paste0(
  "\nSelect the first ",
  pc_num,
  " of all PCs to explain >= 90% variance\n"
))

######## Elbow Method to Select K ########
sklearn <- reticulate::import("sklearn.cluster")
ybce <- reticulate::import("yellowbrick.cluster.elbow")
el <-
  ybce$kelbow_visualizer(sklearn$KMeans(random_state = 4L, n_init = 1L),
                         data_pc,
                         k = maxK,
                         show = FALSE)
k <- el$elbow_value_
score <- el$elbow_score_
cat(paste0(
  "Elbow at K = ",
  k,
  ", Score = ",
  score,
  " (testing with maxK=",
  maxK,
  ")\n"
))
el$fig$savefig(
  paste0(
    filebase,
    ".PC",
    pc_num,
    ".maxK",
    maxK,
    ".KMeansDistortionScore.png"
  ),
  dpi = 300,
  bbox_inches = "tight"
)

######## Draw Silhouette Plot ########
sklearn <- reticulate::import("sklearn.cluster")
ybc <- reticulate::import("yellowbrick.cluster")
sp <-
  ybc$silhouette_visualizer(sklearn$KMeans(as.integer(k), random_state = 4L, n_init = 1L),
                            data_pc,
                            show = FALSE)
sp$fig$savefig(
  paste0(filebase,
         ".PC",
         pc_num,
         ".KMeans",
         k,
         ".SilhouettePlot.png"),
  dpi = 300,
  bbox_inches = "tight"
)

######## Silhouette Score Peak ########
sklearn <- reticulate::import("sklearn.cluster")
ybce <- reticulate::import("yellowbrick.cluster.elbow")
el <-
  ybce$kelbow_visualizer(
    sklearn$KMeans(random_state = 4L, n_init = 1L),
    data_pc,
    k = maxK,
    metric = "silhouette",
    show = FALSE
  )
k <- el$elbow_value_
score <- el$elbow_score_
cat(
  paste0(
    "Peak of Silhouette Score at K = ",
    k,
    ", Score = ",
    score,
    " (testing with maxK=",
    maxK,
    ")\n"
  )
)
el$fig$savefig(
  paste0(filebase,
         ".PC",
         pc_num,
         ".maxK",
         maxK,
         ".SilhouetteScore.png"),
  dpi = 300,
  bbox_inches = "tight"
)

######## KMeans Clustering ########
kmeans <- stats::kmeans(data_pc, centers = k, iter.max = 30)
cluster <- data.frame(kmeans[["cluster"]])
cluster$Name <- rownames(cluster)
colnames(cluster) <- c("Cluster", "Name")
data %<>% dplyr::left_join(cluster, by = "Name")
data <- data[c(1:5, ncol(data), 6:(ncol(data) - 1))]
write_tcsv(data, paste0(filebase, ".PC", pc_num, ".KMeans", k, ".xls"))
To improve how the initial centroids are chosen, David Arthur and Sergei Vassilvitskii published the 2007 paper "k-means++: The advantages of careful seeding"; sklearn.cluster.KMeans uses init='k-means++' by default. The algorithm selects the k cluster centers one at a time, and the farther a sample lies from the centers already chosen, the more likely it is to be picked as the next center.
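A minimal sketch of that seeding rule, assuming x is a 2-D NumPy array (the function kmeanspp_seed is illustrative, not sklearn's implementation): unlike the deterministic argmax in InitialCentroid above, each new center is drawn at random with probability proportional to D(x)^2.

import numpy as np

def kmeanspp_seed(x, K, seed=None):
    rng = np.random.default_rng(seed)
    centers = [x[rng.integers(len(x))]]  # first center: uniform at random
    while len(centers) < K:
        # D(x)^2: squared distance from each point to its nearest chosen center
        diff = x[:, None, :] - np.asarray(centers)[None, :, :]
        d2 = np.min((diff ** 2).sum(axis=2), axis=1)
        # sample the next center with probability proportional to D(x)^2
        centers.append(x[rng.choice(len(x), p=d2 / d2.sum())])
    return np.array(centers)

Weighting the draw by D(x)^2 instead of always taking the argmax still spreads the initial centers apart, while keeping a single outlier from being picked every time.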