Skip to content

Clusterer

Implementation of Self-Organizing Map.

SOM(n_columns=None, n_rows=None, sigma=1.0, learning_rate=0.5, decay_function='inverse_decay_to_zero', neighborhood_function='gaussian', topology='rectangular', activation_distance='euclidean', random_state=None)

Bases: BaseEstimator, ClusterMixin

Class to fit and visualize a Self-Organizing Map (SOM).

The implementation uses MiniSom from minisom. Read more in the [user_guide].

Parameters:

Name Type Description Default
n_columns int | None

The number of columns in the map.

None
n_rows int | None

The number of rows in the map.

None
sigma float

Spread of the neighborhood function.

1.0
learning_rate float

Initial learning rate.

0.5
decay_function str | Callable

Function that reduces learning_rate and sigma at each iteration. Possible values: 'inverse_decay_to_zero', 'linear_decay_to_zero', 'asymptotic_decay' or callable.

'inverse_decay_to_zero'
neighborhood_function str

Function that weights the neighborhood of a position in the map. Possible values: 'gaussian', 'mexican_hat', 'bubble', 'triangle'.

'gaussian'
topology str

Topology of the map. Possible values: 'rectangular', 'hexagonal'.

'rectangular'
activation_distance str | Callable

Distance used to activate the map. Possible values: 'euclidean', 'cosine', 'manhattan', 'chebyshev' or callable.

'euclidean'
random_state RandomState | int | None

Control the randomization of the algorithm.

  • If int, random_state is the seed used by the random number generator.
  • If RandomState instance, random_state is the random number generator.
  • If None, the random number generator is the RandomState instance used by np.random.
None
Source code in src/imblearn_extra/clover/clusterer/_som.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def __init__(
    self: Self,
    n_columns: int | None = None,
    n_rows: int | None = None,
    sigma: float = 1.0,
    learning_rate: float = 0.5,
    decay_function: str | Callable = 'inverse_decay_to_zero',
    neighborhood_function: str = 'gaussian',
    topology: str = 'rectangular',
    activation_distance: str | Callable = 'euclidean',
    random_state: np.random.RandomState | int | None = None,
) -> None:
    """Store the hyperparameters of the self-organizing map.

    Every argument is saved unchanged on the instance under the same name.
    No validation or computation is performed here.
    """
    # Grid dimensions, `None` is stored as given
    self.n_columns = n_columns
    self.n_rows = n_rows
    # Training hyperparameters
    self.sigma = sigma
    self.learning_rate = learning_rate
    self.decay_function = decay_function
    self.neighborhood_function = neighborhood_function
    # Map geometry and distance
    self.topology = topology
    self.activation_distance = activation_distance
    # Randomization control
    self.random_state = random_state

fit(X, y=None, **fit_params)

Train the self-organizing map.

Parameters:

Name Type Description Default
X ArrayLike

Training instances to cluster.

required
y ArrayLike | None

Ignored.

None
fit_params dict[str, Any]

Parameters to pass to train method of the MiniSom object.

The following parameters can be used:

num_iteration: If use_epochs is False, the weights will be updated num_iteration times. Otherwise they will be updated len(X) * num_iteration times.

random_order: If True, samples are picked in random order. Otherwise the samples are picked sequentially.

verbose: If True the status of the training will be printed each time the weights are updated.

use_epochs: If True the SOM will be trained for num_iteration epochs. In one epoch the weights are updated len(data) times and the learning rate is constant throughout a single epoch.

{}

Returns:

Type Description
Self

The object itself.

Source code in src/imblearn_extra/clover/clusterer/_som.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
def fit(self: Self, X: npt.ArrayLike, y: npt.ArrayLike | None = None, **fit_params: Any) -> Self:
    """Train the self-organizing map.

    The input is validated, min-max scaled and used to train a MiniSom
    object. Each sample is then assigned to its winning neuron and the
    neurons that won at least one sample are enumerated as cluster labels.

    Args:
        X:
            Training instances to cluster.

        y:
            Ignored.

        fit_params:
            Parameters to pass to train method of the MiniSom object.
            When `num_iteration` is not provided, it is set to 1000.

            The following parameters can be used:

            num_iteration: If `use_epochs` is `False`, the weights will be
            updated `num_iteration` times. Otherwise they will be updated
            `len(X) * num_iteration` times.

            random_order:
            If `True`, samples are picked in random order.
            Otherwise the samples are picked sequentially.

            verbose:
            If `True` the status of the training will be
            printed each time the weights are updated.

            use_epochs:
            If `True` the SOM will be trained for num_iteration epochs.
            In one epoch the weights are updated `len(data)` times and
            the learning rate is constant throughout a single epoch.

    Returns:
        The object itself.
    """
    # Draw an integer from the random state, it is passed to MiniSom as `random_seed`
    self.random_state_ = check_random_state(self.random_state).randint(low=np.iinfo(np.int32).max)

    # Check and normalize input data
    X_scaled = minmax_scale(check_array(X, dtype=np.float32))

    # Initialize size, any dimension left as `None` is derived from
    # a target of 5 * sqrt(n_samples) neurons
    n_neurons = 5 * np.sqrt(X_scaled.shape[0])
    if self.n_rows is None and self.n_columns is None:
        self.n_rows_ = self.n_columns_ = int(np.ceil(np.sqrt(n_neurons)))
    elif self.n_rows is None and self.n_columns is not None:
        self.n_columns_ = self.n_columns
        self.n_rows_ = int(np.ceil(n_neurons / self.n_columns_))
    elif self.n_columns is None and self.n_rows is not None:
        self.n_rows_ = self.n_rows
        self.n_columns_ = int(np.ceil(n_neurons / self.n_rows_))
    elif self.n_columns is not None and self.n_rows is not None:
        self.n_rows_ = self.n_rows
        self.n_columns_ = self.n_columns

    # Create MiniSom object
    self.algorithm_ = MiniSom(
        x=self.n_rows_,
        y=self.n_columns_,
        input_len=X_scaled.shape[1],
        sigma=self.sigma,
        learning_rate=self.learning_rate,
        decay_function=self.decay_function,
        neighborhood_function=self.neighborhood_function,
        topology=self.topology,
        activation_distance=self.activation_distance,
        random_seed=self.random_state_,
    )

    # Fit MiniSom, a caller supplied `num_iteration` overrides the default
    train_params = {'num_iteration': 1000, **fit_params}
    self.algorithm_.train(data=X_scaled, **train_params)

    # Grid labels, the winning coordinates of each sample as plain Python integers
    labels_coords = [(int(i), int(j)) for i, j in [self.algorithm_.winner(x_scaled) for x_scaled in X_scaled]]

    # Generate labels mapping
    self.labels_mapping_ = generate_labels_mapping(labels_coords)

    # Generate cluster labels
    self.labels_ = np.array(
        [self.labels_mapping_[grid_label] for grid_label in labels_coords],
    )

    # Generate labels neighbors
    self.neighbors_ = self._generate_neighbors(
        sorted(set(labels_coords)),
        self.labels_mapping_,
    )

    return self

fit_predict(X, y=None, **fit_params)

Train the self-organizing map and assign cluster labels to samples.

Parameters:

Name Type Description Default
X ArrayLike

Training instances to cluster.

required
y ArrayLike | None

Ignored.

None
fit_params dict[str, Any]

Parameters to pass to train method of the MiniSom object.

The following parameters can be used:

num_iteration: If use_epochs is False, the weights will be updated num_iteration times. Otherwise they will be updated len(X) * num_iteration times.

random_order: If True, samples are picked in random order. Otherwise the samples are picked sequentially.

verbose: If True the status of the training will be printed each time the weights are updated.

use_epochs: If True the SOM will be trained for num_iteration epochs. In one epoch the weights are updated len(data) times and the learning rate is constant throughout a single epoch.

{}

Returns:

Name Type Description
labels NDArray

Index of the cluster each sample belongs to.

Source code in src/imblearn_extra/clover/clusterer/_som.py
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
def fit_predict(
    self: Self,
    X: npt.ArrayLike,
    y: npt.ArrayLike | None = None,
    **fit_params: dict[str, Any],
) -> npt.NDArray:
    """Train the self-organizing map and assign cluster labels to samples.

    Args:
        X:
            New data to transform.

        y:
            Ignored.

        fit_params:
            Parameters to pass to train method of the MiniSom object.

            The following parameters can be used:

            num_iteration: If `use_epochs` is `False`, the weights will be
            updated `num_iteration` times. Otherwise they will be updated
            `len(X) * num_iteration` times.

            random_order:
            If `True`, samples are picked in random order.
            Otherwise the samples are picked sequentially.

            verbose:
            If `True` the status of the training will be
            printed each time the weights are updated.

            use_epochs:
            If `True` the SOM will be trained for num_iteration epochs.
            In one epoch the weights are updated `len(data)` times and
            the learning rate is constat throughout a single epoch.

    Returns:
        labels:
            Index of the cluster each sample belongs to.
    """
    return self.fit(X=X, y=None, **fit_params).labels_

extract_topological_neighbors(col, row, topology, n_rows, n_columns, labels_coords_unique)

Return the topological neighbors of a neuron.

Source code in src/imblearn_extra/clover/clusterer/_som.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def extract_topological_neighbors(
    col: int,
    row: int,
    topology: str,
    n_rows: int,
    n_columns: int,
    labels_coords_unique: list[tuple[int, int]],
) -> list[tuple[int, int]]:
    """Return the topological neighbors of a neuron.

    Candidates are the positions one step away along each axis. For the
    `'hexagonal'` topology two more candidates are added, on the side that
    is selected by the parity of `row`. A candidate is returned only if it
    lies inside the grid and is contained in `labels_coords_unique`.
    """

    # Steps shared by the two grid types, expressed as (column, row) offsets
    steps = [(-1, 0), (1, 0), (0, -1), (0, 1)]

    # Extra steps of the hexagonal grid type, their sign alternates with the row
    if topology == 'hexagonal':
        shift = (-1) ** row
        steps.extend([(-shift, -shift), (-shift, shift)])

    # Keep the candidates that satisfy the constraints, in the order of the steps
    neighbors = []
    for col_step, row_step in steps:
        candidate = (col + col_step, row + row_step)
        inside_grid = 0 <= candidate[0] < n_columns and 0 <= candidate[1] < n_rows
        if inside_grid and candidate in labels_coords_unique:
            neighbors.append(candidate)

    return neighbors

generate_labels_mapping(labels_coords)

Generate a mapping between grid labels and cluster labels.

Source code in src/imblearn_extra/clover/clusterer/_som.py
19
20
21
22
23
24
25
26
27
28
def generate_labels_mapping(labels_coords: list[tuple[int, int]]) -> dict[tuple[int, int], int]:
    """Generate a mapping between grid labels and cluster labels.

    The distinct grid labels are sorted and numbered consecutively from zero.
    """

    # Number the sorted distinct grid labels
    return {grid_label: cluster_label for cluster_label, grid_label in enumerate(sorted(set(labels_coords)))}