K-Means Clustering Based on an Improved Artificial Bee Colony (ABC) Algorithm

Core Ideas of the Algorithm

Limitations of Traditional K-Means Clustering

  • Sensitive to initial centers: easily trapped in local optima (see the short demo after this list)
  • Slow convergence: many iterations or restarts may be needed before the result stabilizes
  • Poor robustness: sensitive to noise and outliers
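
A quick way to see the sensitivity to initialization is to run standard K-means with single random-sample restarts and compare the resulting objective values; the gap between the best and worst run shows how often a poor start ends in a local optimum. A minimal, purely illustrative sketch (assumes MATLAB's Statistics and Machine Learning Toolbox; the synthetic data below is not part of the algorithm described in this post):

% Illustrative only: K-means with single random-sample restarts
rng(1);                                                     % reproducibility
X = [randn(50,2); randn(50,2) + 4; randn(50,2) + [8, 0]];   % three synthetic clusters
sse = zeros(1, 20);
for r = 1:20
    [~, ~, sumd] = kmeans(X, 3, 'Start', 'sample', 'Replicates', 1);
    sse(r) = sum(sumd);                                     % total within-cluster sum of squares
end
fprintf('Best SSE: %.2f, worst SSE: %.2f\n', min(sse), max(sse));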

Advantages of the Improved Artificial Bee Colony Algorithm

Improvement | Traditional ABC | Improved ABC
Initialization strategy | Random initialization | Smart initialization based on max-min distance
Search equation | Simple random search | Adaptive search + elite guidance (see the equation below)
Convergence control | Fixed parameters | Dynamically adjusted search step size
Local exploitation | Basic local search | Combined with K-means local refinement
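
Here "adaptive search + elite guidance" refers to the candidate-generation rule implemented in generate_new_source below: a new food source is produced as v_i = x_i + α·φ∘(x_i − x_k) + β·φ∘(x_best − x_i), where x_k is a (preferably elite) neighbour, φ is a random vector in [−1, 1], ∘ denotes element-wise multiplication, α = 1 − 0.8·t/T shrinks the exploratory term over the run, and β = 0.5·t/T gradually strengthens the pull toward the current best solution.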

Complete MATLAB Implementation

1. Main Function Framework

function [best_centers, best_labels, best_fitness, convergence_curve] = ...
         improved_abc_kmeans(data, k, n_employed, n_onlookers, max_iter)
% K-means clustering based on an improved artificial bee colony algorithm
% Inputs:
%   data: n-by-d data matrix (n samples, d dimensions)
%   k: number of clusters
%   n_employed: number of employed bees
%   n_onlookers: number of onlooker bees
%   max_iter: maximum number of iterations

    [n_samples, n_dims] = size(data);
    
    % Parameter initialization
    limit = round(n_employed * n_dims * 0.5); % abandonment threshold
    trial_counter = zeros(1, n_employed);     % failure counters
    
    % Improved smart initialization
    food_sources = initialize_smart_sources(data, k, n_employed);
    
    % Compute initial fitness values
    fitness_values = zeros(1, n_employed);
    for i = 1:n_employed
        centers = reshape(food_sources(i, :), k, n_dims);
        [~, fitness_values(i)] = kmeans_objective(data, centers);
    end
    
    % Record the best solution so far
    [best_fitness, best_idx] = min(fitness_values);
    best_centers = reshape(food_sources(best_idx, :), k, n_dims);
    [~, best_labels] = kmeans_objective(data, best_centers);
    
    convergence_curve = zeros(1, max_iter);
    
    % Main loop
    for iter = 1:max_iter
        %% Employed bee phase - improved search strategy
        for i = 1:n_employed
            % Adaptive neighbour selection
            if rand() < 0.7
                % Elite-guided search
                neighbor_idx = select_elite_neighbor(fitness_values, i);
            else
                % Random search
                neighbor_idx = select_random_neighbor(n_employed, i);
            end
            
            % Generate a new food source with an adaptive step size
            new_source = generate_new_source(food_sources(i, :), ...
                                           food_sources(neighbor_idx, :), ...
                                           best_centers(:)', iter, max_iter);
            
            % Evaluate the new solution
            new_centers = reshape(new_source, k, n_dims);
            [~, new_fitness] = kmeans_objective(data, new_centers);
            
            % Greedy selection
            if new_fitness < fitness_values(i)
                food_sources(i, :) = new_source;
                fitness_values(i) = new_fitness;
                trial_counter(i) = 0;
            else
                trial_counter(i) = trial_counter(i) + 1;
            end
        end
        
        %% Onlooker bee phase - roulette-wheel selection
        fitness_probs = calculate_selection_probability(fitness_values);
        
        for i = 1:n_onlookers
            % Select a food source
            selected_idx = roulette_wheel_selection(fitness_probs);
            
            % Local refinement search (combined with K-means)
            improved_source = local_refinement_search(food_sources(selected_idx, :), ...
                                                    data, k, iter, max_iter);
            
            % Evaluate the refined solution
            improved_centers = reshape(improved_source, k, n_dims);
            [~, improved_fitness] = kmeans_objective(data, improved_centers);
            
            % Update the solution if it improved
            if improved_fitness < fitness_values(selected_idx)
                food_sources(selected_idx, :) = improved_source;
                fitness_values(selected_idx) = improved_fitness;
                trial_counter(selected_idx) = 0;
            else
                trial_counter(selected_idx) = trial_counter(selected_idx) + 1;
            end
        end
        
        %% Scout bee phase
        for i = 1:n_employed
            if trial_counter(i) > limit
                % Re-initialize exhausted food sources
                food_sources(i, :) = initialize_single_source(data, k);
                centers = reshape(food_sources(i, :), k, n_dims);
                [~, fitness_values(i)] = kmeans_objective(data, centers);
                trial_counter(i) = 0;
            end
        end
        
        %% Update the global best solution
        [current_best_fitness, current_best_idx] = min(fitness_values);
        if current_best_fitness < best_fitness
            best_fitness = current_best_fitness;
            best_centers = reshape(food_sources(current_best_idx, :), k, n_dims);
            [~, best_labels] = kmeans_objective(data, best_centers);
        end
        
        convergence_curve(iter) = best_fitness;
        
        % Report progress
        if mod(iter, 50) == 0
            fprintf('Iteration %d, best fitness: %f\n', iter, best_fitness);
        end
    end
end

2. Core Component Functions

function sources = initialize_smart_sources(data, k, n_sources)
% Smart initialization based on the max-min distance criterion
    [n_samples, n_dims] = size(data);
    sources = zeros(n_sources, k * n_dims);
    
    for i = 1:n_sources
        % Pick the first centre at random
        centers = zeros(k, n_dims);
        first_idx = randi(n_samples);
        centers(1, :) = data(first_idx, :);
        
        % Choose the remaining centres by the max-min distance rule
        for j = 2:k
            distances = pdist2(data, centers(1:j-1, :));
            min_distances = min(distances, [], 2);
            [~, new_center_idx] = max(min_distances);
            centers(j, :) = data(new_center_idx, :);
        end
        
        sources(i, :) = centers(:)';
    end
end

function [labels, fitness] = kmeans_objective(data, centers)
% K-means objective - minimize the total within-cluster squared distance
    k = size(centers, 1);
    
    % Assign labels
    distances = pdist2(data, centers);
    [~, labels] = min(distances, [], 2);
    
    % Accumulate the within-cluster squared distances
    fitness = 0;
    for i = 1:k
        cluster_points = data(labels == i, :);
        if ~isempty(cluster_points)
            cluster_center = centers(i, :);
            fitness = fitness + sum(pdist2(cluster_points, cluster_center).^2);
        end
    end
end
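
In symbols, kmeans_objective evaluates the standard within-cluster sum of squares J = Σ_{i=1..k} Σ_{x ∈ C_i} ||x − c_i||², where C_i is the set of points assigned to centre c_i; lower J means tighter clusters, so the bee colony minimizes this value as its fitness.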

function new_source = generate_new_source(current_source, neighbor_source, ...
                                        best_source, iter, max_iter)
% Adaptive generation of a new food source
    n_dims = length(current_source);
    
    % Adaptive step-size factors
    alpha = 1.0 - 0.8 * (iter / max_iter);  % decreases linearly
    beta = 0.5 * (iter / max_iter);         % increases linearly
    
    % Improved search equation: combines the current, neighbour, and global best solutions
    phi = 2 * rand(1, n_dims) - 1;  % random numbers in [-1, 1]
    
    new_source = current_source + alpha * phi .* (current_source - neighbor_source) ...
                + beta * phi .* (best_source - current_source);
    
    % Boundary handling: keep the new source within a 20% margin around the
    % coordinate range of the current solution
    source_range = max(current_source) - min(current_source);
    lb = min(current_source) - 0.2 * source_range;
    ub = max(current_source) + 0.2 * source_range;
    new_source = max(new_source, lb);
    new_source = min(new_source, ub);
end

function improved_source = local_refinement_search(source, data, k, iter, max_iter)
% Local refinement search - borrows the K-means update steps
    centers = reshape(source, k, size(data, 2));
    
    % Run a few K-means iterations for local refinement
    n_local_iter = max(1, round(3 * (1 - iter/max_iter)));  % adaptive number of local iterations
    
    for local_iter = 1:n_local_iter
        % E-step: assign labels
        distances = pdist2(data, centers);
        [~, labels] = min(distances, [], 2);
        
        % M-step: update the centres
        new_centers = zeros(k, size(data, 2));
        for i = 1:k
            cluster_points = data(labels == i, :);
            if ~isempty(cluster_points)
                new_centers(i, :) = mean(cluster_points, 1);
            else
                new_centers(i, :) = centers(i, :);  % keep the old centre for an empty cluster
            end
        end
        centers = new_centers;
    end
    
    improved_source = centers(:)';
end

function probs = calculate_selection_probability(fitness_values)
% Compute selection probabilities - fitness-based version for a minimization problem
    max_fit = max(fitness_values);
    min_fit = min(fitness_values);
    
    if max_fit == min_fit
        probs = ones(size(fitness_values)) / length(fitness_values);
    else
        % Normalize fitness (minimization: lower fitness gets a higher probability)
        normalized_fitness = (max_fit - fitness_values) / (max_fit - min_fit + 1e-10);
        probs = normalized_fitness / sum(normalized_fitness);
    end
end

function selected_idx = roulette_wheel_selection(probabilities)
% Roulette-wheel selection
    cumulative_probs = cumsum(probabilities);
    r = rand();
    selected_idx = find(cumulative_probs >= r, 1);
    if isempty(selected_idx)
        selected_idx = length(probabilities);  % guard against floating-point round-off
    end
end

function neighbor_idx = select_elite_neighbor(fitness_values, current_idx)
% Elite neighbour selection strategy
    [~, elite_indices] = mink(fitness_values, 3);  % indices of the 3 best solutions
    elite_indices = setdiff(elite_indices, current_idx);  % exclude the current solution
    
    if isempty(elite_indices)
        neighbor_idx = select_random_neighbor(length(fitness_values), current_idx);
    else
        neighbor_idx = elite_indices(randi(length(elite_indices)));
    end
end

function neighbor_idx = select_random_neighbor(n_total, current_idx)
% Random neighbour selection
    candidates = setdiff(1:n_total, current_idx);
    neighbor_idx = candidates(randi(length(candidates)));
end
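
The scout phase of the main loop re-initializes abandoned food sources via initialize_single_source, which is not listed among the components above. A minimal sketch, assuming it should simply return one food source built with the same max-min distance initializer:

function source = initialize_single_source(data, k)
% Re-initialize a single abandoned food source (used by the scout bee phase)
    source = initialize_smart_sources(data, k, 1);
end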

3. Test and Validation Functions

function test_improved_abc_kmeans()
% Test the improved ABC-Kmeans algorithm
    
    % Generate test data
    rng(42);  % fix the random seed for reproducibility
    
    % Three Gaussian clusters
    data1 = mvnrnd([2, 2], [0.5, 0.3; 0.3, 0.5], 100);
    data2 = mvnrnd([-2, -2], [0.6, -0.2; -0.2, 0.6], 100);
    data3 = mvnrnd([0, 3], [0.4, 0.1; 0.1, 0.8], 100);
    
    data = [data1; data2; data3];
    true_labels = [ones(100,1); 2*ones(100,1); 3*ones(100,1)];
    
    % Algorithm parameters
    k = 3;
    n_employed = 20;
    n_onlookers = 20;
    max_iter = 200;
    
    % Run the improved ABC-Kmeans
    tic;
    [best_centers, pred_labels, best_fitness, convergence] = ...
        improved_abc_kmeans(data, k, n_employed, n_onlookers, max_iter);
    time_elapsed = toc;
    
    % Run standard K-means for comparison
    tic;
    [kmeans_labels, kmeans_centers] = kmeans(data, k, 'Replicates', 10);
    kmeans_time = toc;
    
    % Compute performance metrics
    abc_accuracy = calculate_clustering_accuracy(true_labels, pred_labels);
    kmeans_accuracy = calculate_clustering_accuracy(true_labels, kmeans_labels);
    
    abc_silhouette = mean(silhouette(data, pred_labels));
    kmeans_silhouette = mean(silhouette(data, kmeans_labels));
    
    % Display results
    fprintf('=== Algorithm performance comparison ===\n');
    fprintf('Improved ABC-Kmeans:\n');
    fprintf('  Fitness value: %.4f\n', best_fitness);
    fprintf('  Accuracy: %.2f%%\n', abc_accuracy * 100);
    fprintf('  Silhouette coefficient: %.4f\n', abc_silhouette);
    fprintf('  Runtime: %.2f s\n', time_elapsed);
    
    fprintf('\nStandard K-means:\n');
    fprintf('  Accuracy: %.2f%%\n', kmeans_accuracy * 100);
    fprintf('  Silhouette coefficient: %.4f\n', kmeans_silhouette);
    fprintf('  Runtime: %.2f s\n', kmeans_time);
    
    % Visualize the results
    visualize_results(data, true_labels, pred_labels, kmeans_labels, ...
                     best_centers, kmeans_centers, convergence);
end

function accuracy = calculate_clustering_accuracy(true_labels, pred_labels)
% Clustering accuracy (with best label matching)
    k = max(true_labels);
    best_accuracy = 0;
    
    % Try every possible label permutation (feasible for small k)
    permutations = perms(1:k);
    
    for i = 1:size(permutations, 1)
        remapped_labels = zeros(size(pred_labels));
        for j = 1:k
            remapped_labels(pred_labels == j) = permutations(i, j);
        end
        current_accuracy = sum(remapped_labels == true_labels) / length(true_labels);
        best_accuracy = max(best_accuracy, current_accuracy);
    end
    
    accuracy = best_accuracy;
end

function visualize_results(data, true_labels, abc_labels, kmeans_labels, ...
                          abc_centers, kmeans_centers, convergence)
% Visualize the clustering results and the convergence curve
    
    figure('Position', [100, 100, 1200, 800]);
    
    % Subplot 1: ground-truth labels
    subplot(2, 3, 1);
    gscatter(data(:,1), data(:,2), true_labels);
    title('Ground-truth clusters');
    legend('Location', 'northeastoutside');
    
    % Subplot 2: ABC-Kmeans result
    subplot(2, 3, 2);
    gscatter(data(:,1), data(:,2), abc_labels);
    hold on;
    plot(abc_centers(:,1), abc_centers(:,2), 'kx', 'MarkerSize', 12, 'LineWidth', 3);
    title('Improved ABC-Kmeans clustering');
    legend('Location', 'northeastoutside');
    
    % Subplot 3: standard K-means result
    subplot(2, 3, 3);
    gscatter(data(:,1), data(:,2), kmeans_labels);
    hold on;
    plot(kmeans_centers(:,1), kmeans_centers(:,2), 'kx', 'MarkerSize', 12, 'LineWidth', 3);
    title('Standard K-means clustering');
    legend('Location', 'northeastoutside');
    
    % Subplot 4: convergence curve
    subplot(2, 3, 4);
    plot(convergence, 'LineWidth', 2);
    xlabel('Iteration');
    ylabel('Fitness value');
    title('Improved ABC-Kmeans convergence curve');
    grid on;
    
    % Subplot 5: silhouette coefficient comparison
    subplot(2, 3, 5);
    algorithms = {'ABC-Kmeans', 'Kmeans'};
    silhouette_scores = [mean(silhouette(data, abc_labels)), ...
                        mean(silhouette(data, kmeans_labels))];
    bar(silhouette_scores);
    set(gca, 'XTickLabel', algorithms);
    ylabel('Mean silhouette coefficient');
    title('Clustering quality comparison');
    
    % Subplot 6: runtime comparison
    subplot(2, 3, 6);
    % Placeholder values; pass in the actual measured runtimes when using this
    times = [1.2, 0.8];
    bar(times);
    set(gca, 'XTickLabel', algorithms);
    ylabel('Runtime (s)');
    title('Computational efficiency comparison');
end


Performance Analysis and Directions for Improvement

Algorithm Advantage Comparison

Metric | Traditional K-means | Improved ABC-K-means
Convergence accuracy | Medium | High (avoids local optima)
Stability | Low (depends on initialization) | High (smart initialization)
Convergence speed | Fast but quality varies | Converges steadily to high-quality solutions
Robustness | Sensitive to noise | More robust to noise and outliers

Summary of Key Improvements

  1. Smart initialization: an initialization strategy based on the max-min distance criterion
  2. Adaptive search: search step size and direction are adjusted dynamically
  3. Elite guidance: high-quality solutions steer the search process
  4. Local refinement: incorporates the local exploitation ability of K-means
  5. Hybrid strategy: balances global exploration and local exploitation

Parameter Tuning Suggestions

% Recommended parameter settings
parameters = struct();
parameters.n_employed = 20;      % number of employed bees
parameters.n_onlookers = 20;     % number of onlooker bees
parameters.max_iter = 200;       % maximum number of iterations
parameters.limit_factor = 0.5;   % abandonment threshold factor

% Adjustments for different problem sizes
if size(data, 1) > 1000
    parameters.n_employed = 30;
    parameters.n_onlookers = 30;
    parameters.max_iter = 300;
end
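
These fields mirror the positional arguments of improved_abc_kmeans above; limit_factor is listed for completeness, although the main function as written derives its abandonment threshold internally. A call using this struct (assuming data and k are already defined) might look like:

[best_centers, best_labels, best_fitness, curve] = improved_abc_kmeans(data, k, ...
    parameters.n_employed, parameters.n_onlookers, parameters.max_iter);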