| Line | Hits | Source |
|---|---|---|
| 1 | /* | |
| 2 | * Copyright (c) 2004, the JUNG Project and the Regents of the University | |
| 3 | * of California | |
| 4 | * All rights reserved. | |
| 5 | * | |
| 6 | * This software is open-source under the BSD license; see either | |
| 7 | * "license.txt" or | |
| 8 | * http://jung.sourceforge.net/license.txt for a description. | |
| 9 | * | |
| 10 | * Created on Aug 12, 2004 | |
| 11 | */ | |
| 12 | package edu.uci.ics.jung.algorithms.cluster; | |
| 13 | ||
| 14 | import java.util.Arrays; | |
| 15 | import java.util.Collection; | |
| 16 | import java.util.Comparator; | |
| 17 | import java.util.HashMap; | |
| 18 | import java.util.HashSet; | |
| 19 | import java.util.Iterator; | |
| 20 | import java.util.LinkedList; | |
| 21 | import java.util.Map; | |
| 22 | import java.util.Set; | |
| 23 | ||
| 24 | import cern.jet.random.engine.DRand; | |
| 25 | import cern.jet.random.engine.RandomEngine; | |
| 26 | import edu.uci.ics.jung.algorithms.cluster.KMeansClusterer.NotEnoughClustersException; | |
| 27 | import edu.uci.ics.jung.algorithms.importance.VoltageRanker; | |
| 28 | import edu.uci.ics.jung.graph.ArchetypeGraph; | |
| 29 | import edu.uci.ics.jung.graph.ArchetypeVertex; | |
| 30 | import edu.uci.ics.jung.graph.Vertex; | |
| 31 | import edu.uci.ics.jung.graph.decorators.UserDatumNumberVertexValue; | |
| 32 | import edu.uci.ics.jung.statistics.DiscreteDistribution; | |
| 33 | ||
| 34 | /** | |
| 35 | * <p>Clusters vertices of a <code>Graph</code> based on their ranks as | |
| 36 | * calculated by <code>VoltageRanker</code>. This algorithm is based on, | |
| 37 | * but not identical with, the method described in the paper below. | |
| 38 | * The primary difference is that Wu and Huberman assume a priori that the clusters | |
| 39 | * are of approximately the same size, and therefore use a more complex | |
| 40 | * method than k-means (which is used here) for determining cluster | |
| 41 | * membership based on co-occurrence data.</p> | |
| 42 | * | |
| 43 | * <p>The algorithm proceeds as follows: | |
| 44 | * <ul> | |
| 45 | * <li/>first, generate a set of candidate clusters as follows: | |
| 46 | * <ul> | |
| 47 | * <li/>pick (widely separated) vertex pair, run VoltageRanker | |
| 48 | * <li/>group the vertices in two clusters according to their voltages | |
| 49 | * <li/>store resulting candidate clusters | |
| 50 | * </ul> | |
| 51 | * <li/>second, generate k-1 clusters as follows: | |
| 52 | * <ul> | |
| 53 | * <li/>pick a vertex v as a cluster 'seed' | |
| 54 | * <br>(Wu/Huberman: most frequent vertex in candidate clusters) | |
| 55 | * <li/>calculate co-occurrence over all candidate clusters of v with each other | |
| 56 | * vertex | |
| 57 | * <li/>separate co-occurrence counts into high/low; | |
| 58 | * high vertices constitute a cluster | |
| 59 | * <li/>remove v's vertices from candidate clusters; continue | |
| 60 | * </ul> | |
| 61 | * <li/>finally, remaining unassigned vertices are assigned to the kth ("garbage") | |
| 62 | * cluster. | |
| 63 | * </ul></p> | |
| 64 | * | |
| 65 | * <p><b>NOTE</b>: Depending on how the co-occurrence data splits the data into | |
| 66 | * clusters, the number of clusters returned by this algorithm may be less than the | |
| 67 | * number of clusters requested. The number of clusters will never be more than | |
| 68 | * the number requested, however.</p> | |
| 69 | * | |
| 70 | * @author Joshua O'Madadhain | |
| 71 | * @see <a href="http://www.hpl.hp.com/research/idl/papers/linear/">'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman</a> | |
| 72 | * @see VoltageRanker | |
| 73 | * @see KMeansClusterer | |
| 74 | */ | |
| 75 | public class VoltageClusterer | |
| 76 | { | |
| 77 | public final static String VOLTAGE_KEY = "edu.uci.ics.jung.algorithms.cluster.VoltageClusterer.KEY"; | |
| 78 | ||
| 79 | protected int num_candidates; | |
| 80 | protected KMeansClusterer kmc; | |
| 81 | protected UserDatumNumberVertexValue vv; | |
| 82 | protected VoltageRanker vr; | |
| 83 | protected RandomEngine rand; | |
| 84 | ||
| 85 | /** | |
| 86 | * Creates an instance of a VoltageCluster with the specified parameters. | |
| 87 | * These are mostly parameters that are passed directly to VoltageRanker | |
| 88 | * and KMeansClusterer. | |
| 89 | * | |
| 90 | * @param num_candidates the number of candidate clusters to create | |
| 91 | * @param rank_iterations the number of iterations to run VoltageRanker | |
| 92 | * @param cluster_iterations the number of iterations to run KMeansClusterer | |
| 93 | * @param cluster_convergence the convergence value for KMeansClusterer | |
| 94 | */ | |
| 95 | public VoltageClusterer(int num_candidates, int rank_iterations, | |
| 96 | double rank_convergence, int cluster_iterations, double cluster_convergence) | |
| 97 | 0 | { |
| 98 | 0 | if (num_candidates < 1) |
| 99 | 0 | throw new IllegalArgumentException("must generate >=1 candidates"); |
| 100 | ||
| 101 | 0 | this.num_candidates = num_candidates; |
| 102 | 0 | this.vv = new UserDatumNumberVertexValue(VOLTAGE_KEY); |
| 103 | 0 | this.vr = new VoltageRanker(vv, rank_iterations, rank_convergence); |
| 104 | 0 | this.kmc = new KMeansClusterer(cluster_iterations, cluster_convergence); |
| 105 | 0 | rand = new DRand(); |
| 106 | 0 | } |
| 107 | ||
| 108 | protected void setRandomSeed(int random_seed) | |
| 109 | { | |
| 110 | 0 | rand = new DRand(random_seed); |
| 111 | 0 | } |
| 112 | ||
| 113 | /** | |
| 114 | * Returns a community (cluster) centered around <code>v</code>. | |
| 115 | * @param v the vertex whose community we wish to discover | |
| 116 | */ | |
| 117 | public Collection getCommunity(ArchetypeVertex v) | |
| 118 | { | |
| 119 | 0 | return cluster_internal(v.getGraph(), v, 2); |
| 120 | } | |
| 121 | ||
| 122 | /** | |
| 123 | * Clusters the vertices of <code>g</code> into | |
| 124 | * <code>num_clusters</code> clusters, based on their connectivity. | |
| 125 | * @param g the graph whose vertices are to be clustered | |
| 126 | * @param num_clusters the number of clusters to identify | |
| 127 | */ | |
| 128 | public Collection cluster(ArchetypeGraph g, int num_clusters) | |
| 129 | { | |
| 130 | 0 | return cluster_internal(g, null, num_clusters); |
| 131 | } | |
| 132 | ||
| 133 | /** | |
| 134 | * Clears the voltage decoration values from the vertices of <code>g</code>. | |
| 135 | */ | |
| 136 | public void clear(ArchetypeGraph g) | |
| 137 | { | |
| 138 | 0 | vv.clear(g); |
| 139 | 0 | } |
| 140 | ||
| 141 | /** | |
| 142 | * Does the work of <code>getCommunity</code> and <code>cluster</code>. | |
| 143 | * @param g the graph whose vertices we're clustering | |
| 144 | * @param origin the center (if one exists) of the graph to cluster | |
| 145 | * @param num_clusters | |
| 146 | */ | |
| 147 | protected Collection cluster_internal(ArchetypeGraph g, | |
| 148 | ArchetypeVertex origin, int num_clusters) | |
| 149 | { | |
| 150 | // generate candidate clusters | |
| 151 | // repeat the following 'samples' times: | |
| 152 | // * pick (widely separated) vertex pair, run VoltageRanker | |
| 153 | // * use k-means to identify 2 communities in ranked graph | |
| 154 | // * store resulting candidate communities | |
| 155 | 0 | Set vertices = g.getVertices(); |
| 156 | 0 | int num_vertices = vertices.size(); |
| 157 | 0 | ArchetypeVertex[] v = new ArchetypeVertex[num_vertices]; |
| 158 | 0 | int i = 0; |
| 159 | 0 | for (Iterator iter = vertices.iterator(); iter.hasNext(); ) |
| 160 | 0 | v[i++] = (ArchetypeVertex)iter.next(); |
| 161 | ||
| 162 | 0 | LinkedList candidates = new LinkedList(); |
| 163 | ||
| 164 | 0 | for (int j = 0; j < num_candidates; j++) |
| 165 | { | |
| 166 | ArchetypeVertex source; | |
| 167 | 0 | if (origin == null) |
| 168 | 0 | source = v[(int)(rand.nextDouble() * num_vertices)]; |
| 169 | else | |
| 170 | 0 | source = origin; |
| 171 | 0 | ArchetypeVertex target = null; |
| 172 | do | |
| 173 | { | |
| 174 | 0 | target = v[(int)(rand.nextDouble() * num_vertices)]; |
| 175 | } | |
| 176 | 0 | while (source == target); |
| 177 | 0 | vr.calculateVoltages((Vertex)source, (Vertex)target); |
| 178 | ||
| 179 | 0 | Map voltage_ranks = new HashMap(); |
| 180 | 0 | for (Iterator iter = vertices.iterator(); iter.hasNext(); ) |
| 181 | { | |
| 182 | 0 | ArchetypeVertex w = (ArchetypeVertex)iter.next(); |
| 183 | 0 | double[] voltage = {vv.getNumber(w).doubleValue()}; |
| 184 | 0 | voltage_ranks.put(w, voltage); |
| 185 | } | |
| 186 | ||
| 187 | // Map clusterMap; | |
| 188 | Collection clusters; | |
| 189 | try | |
| 190 | { | |
| 191 | 0 | clusters = kmc.cluster(voltage_ranks, 2); |
| 192 | 0 | Iterator iter = clusters.iterator(); |
| 193 | 0 | candidates.add(((Map)iter.next()).keySet()); |
| 194 | 0 | candidates.add(((Map)iter.next()).keySet()); |
| 195 | // candidates.addAll(clusters); | |
| 196 | } | |
| 197 | 0 | catch (NotEnoughClustersException e) |
| 198 | { | |
| 199 | // ignore this candidate, continue | |
| 200 | 0 | } |
| 201 | } | |
| 202 | ||
| 203 | // repeat the following k-1 times: | |
| 204 | // * pick a vertex v as a cluster seed | |
| 205 | // (Wu/Huberman: most frequent vertex in candidates) | |
| 206 | // * calculate co-occurrence (in candidate clusters) | |
| 207 | // of this vertex with all others | |
| 208 | // * use k-means to separate co-occurrence counts into high/low; | |
| 209 | // high vertices are a cluster | |
| 210 | // * remove v's vertices from candidate clusters | |
| 211 | ||
| 212 | 0 | Collection clusters = new LinkedList(); |
| 213 | 0 | Collection remaining = new HashSet(g.getVertices()); |
| 214 | 0 | Object[] seed_candidates = getSeedCandidates(candidates); |
| 215 | 0 | int seed_index = 0; |
| 216 | ||
| 217 | 0 | for (int j = 0; j < (num_clusters - 1); j++) |
| 218 | { | |
| 219 | 0 | if (remaining.isEmpty()) |
| 220 | 0 | break; |
| 221 | ||
| 222 | Object seed; | |
| 223 | 0 | if (seed_index == 0 && origin != null) |
| 224 | 0 | seed = origin; |
| 225 | else | |
| 226 | { | |
| 227 | 0 | do { seed = seed_candidates[seed_index++]; } |
| 228 | 0 | while (!remaining.contains(seed)); |
| 229 | } | |
| 230 | ||
| 231 | 0 | Map occur_counts = getObjectCounts(candidates, seed); |
| 232 | 0 | if (occur_counts.size() < 2) |
| 233 | 0 | break; |
| 234 | ||
| 235 | // now that we have the counts, cluster them... | |
| 236 | try | |
| 237 | { | |
| 238 | 0 | Collection high_low = kmc.cluster(occur_counts, 2); |
| 239 | // ...get the cluster with the highest-valued centroid... | |
| 240 | 0 | Iterator h_iter = high_low.iterator(); |
| 241 | 0 | Map cluster1 = (Map)h_iter.next(); |
| 242 | 0 | Map cluster2 = (Map)h_iter.next(); |
| 243 | 0 | double[] centroid1 = DiscreteDistribution.mean(cluster1.values()); |
| 244 | 0 | double[] centroid2 = DiscreteDistribution.mean(cluster2.values()); |
| 245 | // double[] centroid1 = (double[])h_iter.next(); | |
| 246 | // double[] centroid2 = (double[])h_iter.next(); | |
| 247 | Collection new_cluster; | |
| 248 | 0 | if (centroid1[0] >= centroid2[0]) |
| 249 | 0 | new_cluster = cluster1.keySet(); |
| 250 | else | |
| 251 | 0 | new_cluster = cluster2.keySet(); |
| 252 | ||
| 253 | // ...remove the elements of new_cluster from each candidate... | |
| 254 | 0 | for (Iterator iter = candidates.iterator(); iter.hasNext(); ) |
| 255 | { | |
| 256 | 0 | Collection cluster = (Collection)iter.next(); |
| 257 | 0 | cluster.removeAll(new_cluster); |
| 258 | } | |
| 259 | 0 | clusters.add(new_cluster); |
| 260 | 0 | remaining.removeAll(new_cluster); |
| 261 | } | |
| 262 | 0 | catch (NotEnoughClustersException nece) |
| 263 | { | |
| 264 | // all remaining vertices are in the same cluster | |
| 265 | 0 | break; |
| 266 | 0 | } |
| 267 | } | |
| 268 | ||
| 269 | // identify remaining vertices (if any) as a 'garbage' cluster | |
| 270 | 0 | if (!remaining.isEmpty()) |
| 271 | 0 | clusters.add(remaining); |
| 272 | ||
| 273 | 0 | return clusters; |
| 274 | } | |
| 275 | ||
| 276 | protected Object getRandomElement(Collection c) | |
| 277 | { | |
| 278 | 0 | return c.toArray()[(int)(rand.nextDouble() * c.size())]; |
| 279 | } | |
| 280 | ||
| 281 | /** | |
| 282 | * Returns an array of cluster seeds, ranked in decreasing order | |
| 283 | * of number of appearances in the specified collection of candidate | |
| 284 | * clusters. | |
| 285 | * @param candidates | |
| 286 | */ | |
| 287 | protected Object[] getSeedCandidates(Collection candidates) | |
| 288 | { | |
| 289 | 0 | final Map occur_counts = getObjectCounts(candidates, null); |
| 290 | ||
| 291 | 0 | Object[] occurrences = occur_counts.keySet().toArray(); |
| 292 | 0 | Arrays.sort(occurrences, new Comparator() |
| 293 | { | |
| 294 | public int compare(Object arg0, Object arg1) | |
| 295 | { | |
| 296 | double[] count0 = (double[])occur_counts.get(arg0); | |
| 297 | double[] count1 = (double[])occur_counts.get(arg1); | |
| 298 | if (count0[0] < count1[0]) | |
| 299 | return -1; | |
| 300 | else if (count0[0] > count1[0]) | |
| 301 | return 1; | |
| 302 | else | |
| 303 | return 0; | |
| 304 | } | |
| 305 | }); | |
| 306 | 0 | return occurrences; |
| 307 | } | |
| 308 | ||
| 309 | protected Map getObjectCounts(Collection candidates, Object seed) | |
| 310 | { | |
| 311 | 0 | Map occur_counts = new HashMap(); |
| 312 | 0 | for (Iterator iter = candidates.iterator(); iter.hasNext(); ) |
| 313 | { | |
| 314 | 0 | Collection candidate = (Collection)iter.next(); |
| 315 | 0 | if (seed == null || candidate.contains(seed)) |
| 316 | { | |
| 317 | 0 | for (Iterator c_iter = candidate.iterator(); c_iter.hasNext(); ) |
| 318 | { | |
| 319 | 0 | Object element = c_iter.next(); |
| 320 | 0 | double[] count = (double[])occur_counts.get(element); |
| 321 | 0 | if (count == null) |
| 322 | { | |
| 323 | 0 | count = new double[1]; |
| 324 | 0 | occur_counts.put(element, count); |
| 325 | } | |
| 326 | 0 | else count[0]++; |
| 327 | } | |
| 328 | } | |
| 329 | } | |
| 330 | ||
| 331 | 0 | return occur_counts; |
| 332 | } | |
| 333 | } |
|
this report was generated by version 1.0.5 of jcoverage. |
copyright © 2003, jcoverage ltd. all rights reserved. |