clusteringwithiris

% Load the iris dataset
load fisheriris;  % 'meas' contains the features, 'species' contains the labels

% Show the first few rows of the dataset
disp('First few rows of the dataset:');
disp(meas(1:5,:));

% Standardize the dataset
meas_scaled = zscore(meas);

% Display the first few rows of the scaled dataset
disp('Scaled Data:');
disp(meas_scaled(1:5,:));

% Perform K-Means clustering
k = 3; % Number of clusters
[idx_kmeans, C] = kmeans(meas_scaled,k);

% Plot the K-Means clustering results (using only the first two features)
figure;
gscatter(meas_scaled(:,1), meas_scaled(:,2), idx_kmeans, 'rgb', 'xo*');
hold on;
plot(C(:,1), C(:,2), 'kx', 'MarkerSize', 15, 'LineWidth', 3);
title('K-Means Clustering (Scaled Data)');
xlabel('Feature 1 (Standardized)');
ylabel('Feature 2 (Standardized)');
legend('Cluster 1', 'Cluster 2', 'Cluster 3', 'Centroids');
hold off;
saveas(gcf, 'KMeans_Clustering_Figure.png');  % Save the figure


% Hierarchical clustering
Z = linkage(meas_scaled, 'ward');

% Plot the Hierarchical clustering dendrogram
figure;
dendrogram(Z);
title('Hierarchical Clustering Dendrogram');
xlabel('Sample Index');
ylabel('Distance');
saveas(gcf, 'Hierarchical_Clustering_Figure.png');  % Save the figure


% DBSCAN clustering
epsilon = 0.5;
minPts = 5;
[idx_dbscan, corepts] = dbscan(meas_scaled, epsilon, minPts);

% Plot the DBSCAN clustering results (using only the first two features)
figure;
gscatter(meas_scaled(:,1), meas_scaled(:,2), idx_dbscan, 'rgb', 'xo*');
title('DBSCAN Clustering');
xlabel('Feature 1 (Standardized)');
ylabel('Feature 2 (Standardized)');
saveas(gcf, 'DBSCAN_Clustering_Figure.png');  % Save the figure

% Silhouette scores
silhouette_score_kmeans = silhouette(meas_scaled, idx_kmeans);
fprintf('Average Silhouette Score for K-Means: %.2f\n', mean(silhouette_score_kmeans));
silhouette_score_dbscan = silhouette(meas_scaled, idx_dbscan);
fprintf('Average Silhouette Score for DBSCAN: %.2f\n', mean(silhouette_score_dbscan));

% Davies-Bouldin Index
db_index_kmeans = daviesbouldin(meas_scaled, idx_kmeans);
fprintf('Davies-Bouldin Index for K-Means: %.2f\n', db_index_kmeans);
db_index_dbscan = daviesbouldin(meas_scaled, idx_dbscan);
fprintf('Davies-Bouldin Index for DBSCAN: %.2f\n', db_index_dbscan);

% Adjusted Rand Index
true_labels = grp2idx(species);  % Convert species labels to numeric
ari_kmeans = rand_index(true_labels, idx_kmeans);
fprintf('Adjusted Rand Index for K-Means: %.2f\n', ari_kmeans);
ari_dbscan = rand_index(true_labels, idx_dbscan);
fprintf('Adjusted Rand Index for DBSCAN: %.2f\n', ari_dbscan);

% Compare clustering algorithms
figure;
subplot(2,2,1);
gscatter(meas_scaled(:,1), meas_scaled(:,2), idx_kmeans, 'rgb', 'xo*');
title('K-Means Clustering');
subplot(2,2,2);
gscatter(meas_scaled(:,1), meas_scaled(:,2), clusterdata(meas_scaled, 'linkage', 'ward', 'maxclust', 3), 'rgb', 'xo*'); % Added maxclust
title('Hierarchical Clustering');
subplot(2,2,3);
gscatter(meas_scaled(:,1), meas_scaled(:,2), idx_dbscan, 'rgb', 'xo*');
title('DBSCAN Clustering');
saveas(gcf, 'Clustering_Comparison_Figure.png');  % Save the figure

% Save the clustering results
writetable(table(meas_scaled, idx_kmeans), 'KMeans_Clustering_Results.csv');
writetable(table(meas_scaled, idx_dbscan), 'DBSCAN_Clustering_Results.csv');

% ---- Custom Rand Index Function ----
function RI = rand_index(trueLabels, predictedLabels)
    % trueLabels: The true cluster labels (ground truth)
    % predictedLabels: The predicted cluster labels from the clustering algorithm

    % Ensure both trueLabels and predictedLabels are column vectors
    trueLabels = trueLabels(:);
    predictedLabels = predictedLabels(:);

    % Get the number of data points
    n = length(trueLabels);

    % Initialize A and B (counts of pairwise agreements)
    A = 0;
    B = 0;

    % Count the number of pairs that are either both in the same cluster or both in different clusters
    for i = 1:n-1
        for j = i+1:n
            if (trueLabels(i) == trueLabels(j)) && (predictedLabels(i) == predictedLabels(j))
                A = A + 1; % Same in both
            elseif (trueLabels(i) ~= trueLabels(j)) && (predictedLabels(i) ~= predictedLabels(j))
                B = B + 1; % Different in both
            end
        end
    end

    % Total number of pairs
    totalPairs = n * (n - 1) / 2;

    % Calculate the Rand Index
    RI = (A + B) / totalPairs;
end

% ---- Davies-Bouldin Index Function ----
function db_index = daviesbouldin(X, idx)
    % X: The data matrix
    % idx: The predicted labels from the clustering algorithm

    % Number of clusters
    numClusters = length(unique(idx));

    % Initialize variables for calculating Davies-Bouldin index
    DB = 0;

    % Loop through all clusters and compute the Davies-Bouldin index
    for i = 1:numClusters
        % Data points in the i-th cluster
        cluster_i = X(idx == i, :);

        % Centroid of the i-th cluster
        centroid_i = mean(cluster_i, 1);

        % Compute the mean distance from the centroid of i-th cluster to each point in the cluster
        Si = mean(pdist2(cluster_i, centroid_i));

        % Initialize maximum similarity for Davies-Bouldin
        max_similarity = 0;

        % Compare with every other cluster j
        for j = 1:numClusters
            if i ~= j
                % Data points in the j-th cluster
                cluster_j = X(idx == j, :);

                % Centroid of the j-th cluster
                centroid_j = mean(cluster_j, 1);

                % Compute the mean distance from the centroid of j-th cluster to each point in the cluster
                Sj = mean(pdist2(cluster_j, centroid_j));

                % Compute the distance between centroids of i-th and j-th clusters
                Dij = pdist2(centroid_i, centroid_j);

                % Calculate the similarity and update max_similarity
                similarity = (Si + Sj) / Dij;
                max_similarity = max(max_similarity, similarity);
            end
        end

        % Add the max similarity for the i-th cluster to the overall Davies-Bouldin index
        DB = DB + max_similarity;
    end

    % Calculate the average Davies-Bouldin index
    db_index = DB / numClusters;
end

First few rows of the dataset:
    5.1000    3.5000    1.4000    0.2000
    4.9000    3.0000    1.4000    0.2000
    4.7000    3.2000    1.3000    0.2000
    4.6000    3.1000    1.5000    0.2000
    5.0000    3.6000    1.4000    0.2000

Scaled Data:
   -0.8977    1.0156   -1.3358   -1.3111
   -1.1392   -0.1315   -1.3358   -1.3111
   -1.3807    0.3273   -1.3924   -1.3111
   -1.5015    0.0979   -1.2791   -1.3111
   -1.0184    1.2450   -1.3358   -1.3111

Average Silhouette Score for K-Means: 0.65
Average Silhouette Score for DBSCAN: 0.47
Davies-Bouldin Index for K-Means: 0.83
Davies-Bouldin Index for DBSCAN: 0.33
Adjusted Rand Index for K-Means: 0.82
Adjusted Rand Index for DBSCAN: 0.75