算法核心思想
传统K均值聚类的局限性
- 对初始中心点敏感:容易陷入局部最优
- 收敛速度慢:需要多次迭代才能稳定
- 鲁棒性差:对噪声和异常值敏感
改进人工蜂群算法的优势
| 改进点 |
传统ABC |
改进ABC |
| 初始化策略 |
随机初始化 |
基于最大最小距离的智能初始化 |
| 搜索方程 |
简单随机搜索 |
自适应搜索+精英引导 |
| 收敛控制 |
固定参数 |
动态调整搜索步长 |
| 局部开发 |
基础局部搜索 |
结合K均值局部优化 |
MATLAB完整实现
1. 主函数框架
function [best_centers, best_labels, best_fitness, convergence_curve] = ...
improved_abc_kmeans(data, k, n_employed, n_onlookers, max_iter)
% 基于改进人工蜂群算法的K均值聚类
% 输入参数:
% data: n×d数据矩阵 (n个样本,d个维度)
% k: 聚类数量
% n_employed: 雇佣蜂数量
% n_onlookers: 观察蜂数量
% max_iter: 最大迭代次数
[n_samples, n_dims] = size(data);
% 参数初始化
limit = round(n_employed * n_dims * 0.5); % 放弃阈值
trial_counter = zeros(1, n_employed); % 失败计数器
% 改进的智能初始化
food_sources = initialize_smart_sources(data, k, n_employed);
% 计算初始适应度
fitness_values = zeros(1, n_employed);
for i = 1:n_employed
centers = reshape(food_sources(i, :), k, n_dims);
[~, fitness_values(i)] = kmeans_objective(data, centers);
end
% 记录最优解
[best_fitness, best_idx] = min(fitness_values);
best_centers = reshape(food_sources(best_idx, :), k, n_dims);
[~, best_labels] = kmeans_objective(data, best_centers);
convergence_curve = zeros(1, max_iter);
% 主循环
for iter = 1:max_iter
%% 雇佣蜂阶段 - 改进的搜索策略
for i = 1:n_employed
% 自适应选择邻居
if rand() < 0.7
% 精英引导搜索
neighbor_idx = select_elite_neighbor(fitness_values, i);
else
% 随机搜索
neighbor_idx = select_random_neighbor(n_employed, i);
end
% 生成新食物源 - 自适应步长
new_source = generate_new_source(food_sources(i, :), ...
food_sources(neighbor_idx, :), ...
best_centers(:)', iter, max_iter);
% 评估新解
new_centers = reshape(new_source, k, n_dims);
[~, new_fitness] = kmeans_objective(data, new_centers);
% 贪婪选择
if new_fitness < fitness_values(i)
food_sources(i, :) = new_source;
fitness_values(i) = new_fitness;
trial_counter(i) = 0;
else
trial_counter(i) = trial_counter(i) + 1;
end
end
%% 观察蜂阶段 - 基于轮盘赌选择
fitness_probs = calculate_selection_probability(fitness_values);
for i = 1:n_onlookers
% 选择食物源
selected_idx = roulette_wheel_selection(fitness_probs);
% 局部精细搜索(结合K均值)
improved_source = local_refinement_search(food_sources(selected_idx, :), ...
data, k, iter, max_iter);
% 评估改进解
improved_centers = reshape(improved_source, k, n_dims);
[~, improved_fitness] = kmeans_objective(data, improved_centers);
% 更新解
if improved_fitness < fitness_values(selected_idx)
food_sources(selected_idx, :) = improved_source;
fitness_values(selected_idx) = improved_fitness;
trial_counter(selected_idx) = 0;
else
trial_counter(selected_idx) = trial_counter(selected_idx) + 1;
end
end
%% 侦察蜂阶段
for i = 1:n_employed
if trial_counter(i) > limit
% 重新初始化失败的食物源
food_sources(i, :) = initialize_single_source(data, k);
centers = reshape(food_sources(i, :), k, n_dims);
[~, fitness_values(i)] = kmeans_objective(data, centers);
trial_counter(i) = 0;
end
end
%% 更新全局最优解
[current_best_fitness, current_best_idx] = min(fitness_values);
if current_best_fitness < best_fitness
best_fitness = current_best_fitness;
best_centers = reshape(food_sources(current_best_idx, :), k, n_dims);
[~, best_labels] = kmeans_objective(data, best_centers);
end
convergence_curve(iter) = best_fitness;
% 显示进度
if mod(iter, 50) == 0
fprintf('迭代 %d, 最佳适应度: %f\n', iter, best_fitness);
end
end
end
2. 核心组件函数
function sources = initialize_smart_sources(data, k, n_sources)
% 基于最大最小距离的智能初始化
[n_samples, n_dims] = size(data);
sources = zeros(n_sources, k * n_dims);
for i = 1:n_sources
% 随机选择第一个中心
centers = zeros(k, n_dims);
first_idx = randi(n_samples);
centers(1, :) = data(first_idx, :);
% 基于最大最小距离选择后续中心
for j = 2:k
distances = pdist2(data, centers(1:j-1, :));
min_distances = min(distances, [], 2);
[~, new_center_idx] = max(min_distances);
centers(j, :) = data(new_center_idx, :);
end
sources(i, :) = centers(:)';
end
end
function [labels, fitness] = kmeans_objective(data, centers)
% K均值目标函数 - 最小化类内距离和
k = size(centers, 1);
% 分配标签
distances = pdist2(data, centers);
[~, labels] = min(distances, [], 2);
% 计算类内距离和
fitness = 0;
for i = 1:k
cluster_points = data(labels == i, :);
if ~isempty(cluster_points)
cluster_center = centers(i, :);
fitness = fitness + sum(pdist2(cluster_points, cluster_center).^2);
end
end
end
function new_source = generate_new_source(current_source, neighbor_source, ...
best_source, iter, max_iter)
% 自适应生成新食物源
n_dims = length(current_source);
% 自适应步长因子
alpha = 1.0 - 0.8 * (iter / max_iter); % 线性递减
beta = 0.5 * (iter / max_iter); % 线性递增
% 改进的搜索方程:结合当前解、邻居解和全局最优解
phi = 2 * rand(1, n_dims) - 1; % [-1, 1]随机数
new_source = current_source + alpha * phi .* (current_source - neighbor_source) ...
+ beta * phi .* (best_source - current_source);
% 边界处理
lb = min(current_source) * 0.8;
ub = max(current_source) * 1.2;
new_source = max(new_source, lb);
new_source = min(new_source, ub);
end
function improved_source = local_refinement_search(source, data, k, iter, max_iter)
% 局部精细搜索 - 结合K均值思想
centers = reshape(source, k, size(data, 2));
% 执行少量K均值迭代进行局部优化
n_local_iter = max(1, round(3 * (1 - iter/max_iter))); % 自适应迭代次数
for local_iter = 1:n_local_iter
% E步:分配标签
distances = pdist2(data, centers);
[~, labels] = min(distances, [], 2);
% M步:更新中心
new_centers = zeros(k, size(data, 2));
for i = 1:k
cluster_points = data(labels == i, :);
if ~isempty(cluster_points)
new_centers(i, :) = mean(cluster_points, 1);
else
new_centers(i, :) = centers(i, :); % 保持原中心
end
end
centers = new_centers;
end
improved_source = centers(:)';
end
function probs = calculate_selection_probability(fitness_values)
% 计算选择概率 - 基于适应度的改进版本
max_fit = max(fitness_values);
min_fit = min(fitness_values);
if max_fit == min_fit
probs = ones(size(fitness_values)) / length(fitness_values);
else
% 归一化适应度(最小化问题)
normalized_fitness = (max_fit - fitness_values) / (max_fit - min_fit + 1e-10);
probs = normalized_fitness / sum(normalized_fitness);
end
end
function selected_idx = roulette_wheel_selection(probabilities)
% 轮盘赌选择
cumulative_probs = cumsum(probabilities);
r = rand();
selected_idx = find(cumulative_probs >= r, 1);
end
function neighbor_idx = select_elite_neighbor(fitness_values, current_idx)
% 精英邻居选择策略
[~, elite_indices] = mink(fitness_values, 3); % 选择前3个最优解
elite_indices = setdiff(elite_indices, current_idx); % 排除当前解
if isempty(elite_indices)
neighbor_idx = select_random_neighbor(length(fitness_values), current_idx);
else
neighbor_idx = elite_indices(randi(length(elite_indices)));
end
end
function neighbor_idx = select_random_neighbor(n_total, current_idx)
% 随机邻居选择
candidates = setdiff(1:n_total, current_idx);
neighbor_idx = candidates(randi(length(candidates)));
end
3. 测试与验证函数
function test_improved_abc_kmeans()
% 测试改进的ABC-Kmeans算法
% 生成测试数据
rng(42); % 设置随机种子保证可重复性
% 三个高斯分布的簇
data1 = mvnrnd([2, 2], [0.5, 0.3; 0.3, 0.5], 100);
data2 = mvnrnd([-2, -2], [0.6, -0.2; -0.2, 0.6], 100);
data3 = mvnrnd([0, 3], [0.4, 0.1; 0.1, 0.8], 100);
data = [data1; data2; data3];
true_labels = [ones(100,1); 2*ones(100,1); 3*ones(100,1)];
% 算法参数
k = 3;
n_employed = 20;
n_onlookers = 20;
max_iter = 200;
% 运行改进ABC-Kmeans
tic;
[best_centers, pred_labels, best_fitness, convergence] = ...
improved_abc_kmeans(data, k, n_employed, n_onlookers, max_iter);
time_elapsed = toc;
% 运行传统Kmeans对比
tic;
[kmeans_labels, kmeans_centers] = kmeans(data, k, 'Replicates', 10);
kmeans_time = toc;
% 计算性能指标
abc_accuracy = calculate_clustering_accuracy(true_labels, pred_labels);
kmeans_accuracy = calculate_clustering_accuracy(true_labels, kmeans_labels);
abc_silhouette = mean(silhouette(data, pred_labels));
kmeans_silhouette = mean(silhouette(data, kmeans_labels));
% 显示结果
fprintf('=== 算法性能比较 ===\n');
fprintf('改进ABC-Kmeans:\n');
fprintf(' 适应度值: %.4f\n', best_fitness);
fprintf(' 准确率: %.2f%%\n', abc_accuracy * 100);
fprintf(' 轮廓系数: %.4f\n', abc_silhouette);
fprintf(' 运行时间: %.2f秒\n', time_elapsed);
fprintf('\n传统Kmeans:\n');
fprintf(' 准确率: %.2f%%\n', kmeans_accuracy * 100);
fprintf(' 轮廓系数: %.4f\n', kmeans_silhouette);
fprintf(' 运行时间: %.2f秒\n', kmeans_time);
% 可视化结果
visualize_results(data, true_labels, pred_labels, kmeans_labels, ...
best_centers, kmeans_centers, convergence);
end
function accuracy = calculate_clustering_accuracy(true_labels, pred_labels)
% 计算聚类准确率(需要最佳标签匹配)
k = max(true_labels);
best_accuracy = 0;
% 尝试所有可能的标签排列
permutations = perms(1:k);
for i = 1:size(permutations, 1)
remapped_labels = zeros(size(pred_labels));
for j = 1:k
remapped_labels(pred_labels == j) = permutations(i, j);
end
current_accuracy = sum(remapped_labels == true_labels) / length(true_labels);
best_accuracy = max(best_accuracy, current_accuracy);
end
accuracy = best_accuracy;
end
function visualize_results(data, true_labels, abc_labels, kmeans_labels, ...
abc_centers, kmeans_centers, convergence)
% 可视化聚类结果和收敛曲线
figure('Position', [100, 100, 1200, 800]);
% 子图1: 真实标签
subplot(2, 3, 1);
gscatter(data(:,1), data(:,2), true_labels);
title('真实聚类分布');
legend('位置', 'northeastoutside');
% 子图2: ABC-Kmeans结果
subplot(2, 3, 2);
gscatter(data(:,1), data(:,2), abc_labels);
hold on;
plot(abc_centers(:,1), abc_centers(:,2), 'kx', 'MarkerSize', 12, 'LineWidth', 3);
title('改进ABC-Kmeans聚类结果');
legend('位置', 'northeastoutside');
% 子图3: 传统Kmeans结果
subplot(2, 3, 3);
gscatter(data(:,1), data(:,2), kmeans_labels);
hold on;
plot(kmeans_centers(:,1), kmeans_centers(:,2), 'kx', 'MarkerSize', 12, 'LineWidth', 3);
title('传统Kmeans聚类结果');
legend('位置', 'northeastoutside');
% 子图4: 收敛曲线
subplot(2, 3, 4);
plot(convergence, 'LineWidth', 2);
xlabel('迭代次数');
ylabel('适应度值');
title('改进ABC-Kmeans收敛曲线');
grid on;
% 子图5: 轮廓系数对比
subplot(2, 3, 5);
algorithms = {'ABC-Kmeans', 'Kmeans'};
silhouette_scores = [mean(silhouette(data, abc_labels)), ...
mean(silhouette(data, kmeans_labels))];
bar(silhouette_scores);
set(gca, 'XTickLabel', algorithms);
ylabel('平均轮廓系数');
title('聚类质量对比');
% 子图6: 运行时间对比
subplot(2, 3, 6);
% 这里需要实际测量运行时间
times = [1.2, 0.8]; % 示例值,实际使用时需要真实测量
bar(times);
set(gca, 'XTickLabel', algorithms);
ylabel('运行时间(秒)');
title('计算效率对比');
end
参考代码 基于改进人工蜂群算法的K均值聚类算法 www.3dddown.com/cna/81318.html
性能分析与改进方向
算法优势对比
| 指标 |
传统K均值 |
改进ABC-K均值 |
| 收敛精度 |
中等 |
高(避免局部最优) |
| 稳定性 |
低(依赖初始化) |
高(智能初始化) |
| 收敛速度 |
快但质量不稳定 |
稳定收敛到优质解 |
| 鲁棒性 |
对噪声敏感 |
对噪声和异常值更鲁棒 |
关键改进点总结
- 智能初始化:基于最大最小距离的初始化策略
- 自适应搜索:动态调整搜索步长和方向
- 精英引导:利用优质解指导搜索过程
- 局部优化:结合K均值的局部开发能力
- 混合策略:全局探索与局部开发的平衡
参数调优建议
% 推荐参数设置
parameters = struct();
parameters.n_employed = 20; % 雇佣蜂数量
parameters.n_onlookers = 20; % 观察蜂数量
parameters.max_iter = 200; % 最大迭代次数
parameters.limit_factor = 0.5; % 放弃阈值因子
% 针对不同问题规模的调整
if size(data, 1) > 1000
parameters.n_employed = 30;
parameters.n_onlookers = 30;
parameters.max_iter = 300;
end