zeimoto committed on
Commit
4f936d9
·
1 Parent(s): 6fd068e

matutils fix

Browse files
Files changed (2) hide show
  1. Dockerfile +12 -0
  2. matutils.py +1354 -0
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.12.3

# Install pip requirements
COPY requirements.txt .
RUN python -m pip install -r requirements.txt

WORKDIR /app
COPY . /app

# Overwrite the pip-installed gensim module with the patched copy.
# The official python image installs packages under
# /usr/local/lib/python<X.Y>/site-packages, so the version in this path must
# match the base image tag above (3.12), otherwise the patch is never imported.
COPY ./matutils.py /usr/local/lib/python3.12/site-packages/gensim/matutils.py

CMD ["python", "app.py"]
matutils.py ADDED
@@ -0,0 +1,1354 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ #
4
+ # Copyright (C) 2011 Radim Rehurek <[email protected]>
5
+ # Licensed under the GNU LGPL v2.1 - https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html
6
+
7
+ """Math helper functions."""
8
+
9
+ from __future__ import with_statement
10
+
11
+
12
+ import logging
13
+ import math
14
+
15
+ from gensim import utils
16
+
17
+ import numpy as np
18
+ import scipy.sparse
19
+ from scipy.stats import entropy
20
+ from scipy.linalg import get_blas_funcs#, triu
21
+ from scipy.linalg.lapack import get_lapack_funcs
22
+ from scipy.special import psi # gamma function utils
23
+
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
def blas(name, ndarray):
    """Look up a single BLAS routine via :func:`scipy.linalg.get_blas_funcs`.

    Parameters
    ----------
    name : str
        Name of the BLAS function, without the type prefix.
    ndarray : numpy.ndarray
        Array passed to SciPy so it can pick the matching type prefix.

    Returns
    -------
    object
        BLAS function for the needed operation on the given data type.

    """
    # get_blas_funcs returns one function per requested name; we request exactly one.
    (func,) = get_blas_funcs((name,), (ndarray,))
    return func
45
+
46
+
47
def argsort(x, topn=None, reverse=False):
    """Return the indices of the `topn` smallest (or greatest) elements of `x`, in sorted order.

    Parameters
    ----------
    x : array_like
        Values to rank.
    topn : int, optional
        How many indices to return. If omitted, indices of all elements are returned.
    reverse : bool, optional
        If True, rank by greatest value first (descending) instead of smallest first (ascending).

    Returns
    -------
    numpy.ndarray
        Array of `topn` indices that sort the array in the requested order.
        An empty list is returned when `topn` is not positive.

    """
    values = np.asarray(x)  # accept lists, tuples, ... through a single code path
    count = values.size if topn is None else topn
    if count <= 0:
        return []
    # descending order is obtained by ranking the negated values
    keys = -values if reverse else values
    if count < keys.size and hasattr(np, 'argpartition'):
        # partial selection first, then sort only the selected candidates
        candidates = np.argpartition(keys, count)[:count]
        order = np.argsort(keys.take(candidates))
        return candidates.take(order)
    return np.argsort(keys)[:count]
79
+
80
+
81
def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0):
    """Convert a streamed bag-of-words corpus into a `scipy.sparse.csc_matrix` with documents as columns.

    Notes
    -----
    When the number of terms, documents and non-zero elements are all known (passed in,
    or available as attributes of `corpus`), the output buffers are pre-allocated
    instead of being grown during iteration.

    Parameters
    ----------
    corpus : iterable of iterable of (int, number)
        Input corpus in BoW format.
    num_terms : int, optional
        Number of terms in `corpus`. If provided, the `corpus.num_terms` attribute (if any) is ignored.
    dtype : data-type, optional
        Data type of the output CSC matrix.
    num_docs : int, optional
        Number of documents in `corpus`. If provided, the `corpus.num_docs` attribute (if any) is ignored.
    num_nnz : int, optional
        Number of non-zero elements in `corpus`. If provided, the `corpus.num_nnz` attribute (if any) is ignored.
    printprogress : int, optional
        Log a progress message at INFO level once every `printprogress` documents. 0 turns progress logging off.

    Returns
    -------
    scipy.sparse.csc_matrix
        `corpus` converted into a sparse CSC matrix.

    See Also
    --------
    :class:`~gensim.matutils.Sparse2Corpus`
        Convert sparse format to Gensim corpus format.

    """
    try:
        # Fill in whatever the caller did not supply from the corpus attributes.
        # The first missing attribute aborts the lookup of the remaining ones.
        if num_terms is None:
            num_terms = corpus.num_terms
        if num_docs is None:
            num_docs = corpus.num_docs
        if num_nnz is None:
            num_nnz = corpus.num_nnz
    except AttributeError:
        pass  # the corpus does not expose these attributes

    if printprogress:
        logger.info("creating sparse matrix from corpus")

    sizes_known = num_terms is not None and num_docs is not None and num_nnz is not None
    if sizes_known:
        # pre-allocated path: write each document straight into fixed-size buffers
        indices = np.empty((num_nnz,), dtype=np.int32)  # HACK assume feature ids fit in 32bit integer
        data = np.empty((num_nnz,), dtype=dtype)
        indptr = [0]
        cursor = 0
        for docno, doc in enumerate(corpus):
            if printprogress and docno % printprogress == 0:
                logger.info("PROGRESS: at document #%i/%i", docno, num_docs)
            end = cursor + len(doc)
            # zip(*doc) splits the document into (token_indices, token_counts)
            ids, weights = zip(*doc) if doc else ([], [])
            indices[cursor:end], data[cursor:end] = ids, weights
            indptr.append(end)
            cursor = end
        assert cursor == num_nnz, "mismatch between supplied and computed number of non-zeros"
        return scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)

    # growing path: the matrix dimensions are discovered while iterating
    data, indices, indptr = [], [], [0]
    num_nnz = 0
    for docno, doc in enumerate(corpus):
        if printprogress and docno % printprogress == 0:
            logger.info("PROGRESS: at document #%i", docno)
        # zip(*doc) splits the document into (token_indices, token_counts)
        ids, weights = zip(*doc) if doc else ([], [])
        indices.extend(ids)
        data.extend(weights)
        num_nnz += len(doc)
        indptr.append(num_nnz)
    if num_terms is None:
        num_terms = max(indices) + 1 if indices else 0
    num_docs = len(indptr) - 1
    # num_docs, num_terms and num_nnz now hold the values observed in the corpus
    data = np.asarray(data, dtype=dtype)
    indices = np.asarray(indices)
    return scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
165
+
166
+
167
def pad(mat, padrow, padcol):
    """Append zero-filled rows and columns to `mat`.

    Parameters
    ----------
    mat : numpy.ndarray
        Input 2D matrix.
    padrow : int
        Number of additional rows. Negative values are treated as 0.
    padcol : int
        Number of additional columns. Negative values are treated as 0.

    Returns
    -------
    numpy.ndarray
        `mat` extended with `padcol` zero columns on the right and `padrow` zero rows at the bottom.

    """
    padrow = 0 if padrow < 0 else padrow
    padcol = 0 if padcol < 0 else padcol
    rows, cols = mat.shape
    right = np.zeros((rows, padcol))
    bottom = np.zeros((padrow, cols + padcol))
    return np.block([[mat, right], [bottom]])
194
+
195
+
196
def zeros_aligned(shape, dtype, order='C', align=128):
    """Create a zero-filled array whose data starts at an `align`-byte boundary.

    Parameters
    ----------
    shape : int or (int, int)
        Shape of array.
    dtype : data-type
        Data type of array.
    order : {'C', 'F'}, optional
        Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory.
    align : int, optional
        Boundary for alignment in bytes.

    Returns
    -------
    numpy.ndarray
        Aligned array.

    """
    itemsize = np.dtype(dtype).itemsize
    nbytes = np.prod(shape, dtype=np.int64) * itemsize
    # over-allocate by `align` bytes so that an aligned window of `nbytes` always fits
    raw = np.zeros(nbytes + align, dtype=np.uint8)  # problematic on win64 ("maximum allowed dimension exceeded")
    offset = -raw.ctypes.data % align  # bytes to skip to reach the next boundary
    window = raw[offset: offset + nbytes]
    return window.view(dtype).reshape(shape, order=order)
220
+
221
+
222
def ismatrix(m):
    """Tell whether `m` is a 2D `numpy.ndarray` or a `scipy.sparse` matrix.

    Parameters
    ----------
    m : object
        Object to check.

    Returns
    -------
    bool
        True if `m` is a 2D `numpy.ndarray` or a `scipy.sparse` matrix, False otherwise.

    """
    if scipy.sparse.issparse(m):
        return True
    return isinstance(m, np.ndarray) and m.ndim == 2
237
+
238
+
239
def any2sparse(vec, eps=1e-9):
    """Convert a numpy.ndarray, `scipy.sparse` vector or (id, weight) iterable into Gensim bag-of-words format.

    Parameters
    ----------
    vec : {`numpy.ndarray`, `scipy.sparse`, iterable of (int, number)}
        Input vector.
    eps : float, optional
        Threshold: entries whose absolute value is not greater than `eps` are left out of the result.

    Returns
    -------
    list of (int, float)
        Vector in BoW format.

    """
    if isinstance(vec, np.ndarray):
        return dense2vec(vec, eps)
    if scipy.sparse.issparse(vec):
        return scipy2sparse(vec, eps)
    # anything else is treated as an iterable of (id, weight) pairs
    bow = []
    for fid, fw in vec:
        if np.abs(fw) > eps:
            bow.append((int(fid), float(fw)))
    return bow
260
+
261
+
262
def scipy2scipy_clipped(matrix, topn, eps=1e-9):
    """Get the `topn` elements of the greatest magnitude (absolute value) from a `scipy.sparse` vector or matrix.

    Parameters
    ----------
    matrix : `scipy.sparse`
        Input vector or matrix (1D or 2D sparse array).
    topn : int
        Number of greatest elements, in absolute value, to return (per row for a matrix).
    eps : float
        Ignored.

    Returns
    -------
    `scipy.sparse.csr_matrix`
        Clipped matrix.

    Raises
    ------
    ValueError
        If `matrix` is not a scipy sparse object.

    """
    if not scipy.sparse.issparse(matrix):
        raise ValueError("'%s' is not a scipy sparse vector." % matrix)
    if topn <= 0:
        return scipy.sparse.csr_matrix([])
    # Return clipped sparse vector if input is a sparse vector.
    if matrix.shape[0] == 1:
        # use np.argpartition/argsort and only form tuples that are actually returned.
        biggest = argsort(abs(matrix.data), topn, reverse=True)
        indices, data = matrix.indices.take(biggest), matrix.data.take(biggest)
        return scipy.sparse.csr_matrix((data, indices, [0, len(indices)]))

    # Return clipped sparse matrix if input is a matrix, processing row by row.
    matrix_indices = []
    matrix_data = []
    matrix_indptr = [0]
    # calling abs() on entire matrix once is faster than calling abs() iteratively for each row
    matrix_abs = abs(matrix)
    for i in range(matrix.shape[0]):
        v = matrix.getrow(i)
        v_abs = matrix_abs.getrow(i)
        # Sort and clip each row vector first.
        biggest = argsort(v_abs.data, topn, reverse=True)
        indices, data = v.indices.take(biggest), v.data.take(biggest)
        # Store the topn indices and values of each row vector.
        matrix_data.append(data)
        matrix_indices.append(indices)
        # `indices` already holds at most `topn` entries
        matrix_indptr.append(matrix_indptr[-1] + len(indices))
    matrix_indices = np.concatenate(matrix_indices).ravel()
    matrix_data = np.concatenate(matrix_data).ravel()
    # Instantiate and return a sparse csr_matrix which preserves the order of indices/data.
    # Use the public `scipy.sparse.csr_matrix` name: the `scipy.sparse.csr` submodule
    # namespace is deprecated in SciPy and the rest of this function already uses the public name.
    return scipy.sparse.csr_matrix(
        (matrix_data, matrix_indices, matrix_indptr),
        shape=(matrix.shape[0], np.max(matrix_indices) + 1)
    )
314
+
315
+
316
def scipy2sparse(vec, eps=1e-9):
    """Convert a single-row scipy.sparse vector into the Gensim bag-of-words format.

    Parameters
    ----------
    vec : `scipy.sparse`
        Sparse vector; must have exactly one row once converted to CSR.
    eps : float, optional
        Threshold: entries whose absolute value is not greater than `eps` are left out of the result.

    Returns
    -------
    list of (int, float)
        Vector in Gensim bag-of-words format.

    """
    row = vec.tocsr()
    assert row.shape[0] == 1
    bow = []
    for pos, val in zip(row.indices, row.data):
        if np.abs(val) > eps:
            bow.append((int(pos), float(val)))
    return bow
336
+
337
+
338
class Scipy2Corpus:
    """Wrap a sequence of dense/sparse vectors as a streamed Gensim corpus.

    See Also
    --------
    :func:`~gensim.matutils.corpus2csc`
        Convert corpus in Gensim format to `scipy.sparse.csc` matrix.

    """
    def __init__(self, vecs):
        """

        Parameters
        ----------
        vecs : iterable of {`numpy.ndarray`, `scipy.sparse`}
            Input vectors.

        """
        self.vecs = vecs

    def __iter__(self):
        """Yield each stored vector converted to BoW format.

        Yields
        ------
        list of (int, float)
            Document in BoW format.

        """
        for vec in self.vecs:
            # dense arrays and sparse vectors need different converters
            convert = full2sparse if isinstance(vec, np.ndarray) else scipy2sparse
            yield convert(vec)

    def __len__(self):
        """Return the number of stored vectors."""
        return len(self.vecs)
367
+
368
+
369
def sparse2full(doc, length):
    """Expand a document in Gensim bag-of-words format into a dense numpy array.

    Parameters
    ----------
    doc : list of (int, number)
        Document in BoW format.
    length : int
        Vector dimensionality. This cannot be inferred from the BoW, and you must supply it explicitly.
        This is typically the vocabulary size or number of topics, depending on how you created `doc`.

    Returns
    -------
    numpy.ndarray
        Dense float32 vector for `doc`.

    See Also
    --------
    :func:`~gensim.matutils.full2sparse`
        Convert dense array to gensim bag-of-words format.

    """
    dense = np.zeros(length, dtype=np.float32)  # positions absent from `doc` stay zero
    # ids are cast to int because numpy does not index by floats;
    # going through a dict keeps only the last weight of a repeated id
    weights = {int(id_): float(val_) for (id_, val_) in doc}
    dense[list(weights)] = list(weights.values())
    return dense
399
+
400
+
401
def full2sparse(vec, eps=1e-9):
    """Convert a dense numpy array into the Gensim bag-of-words format.

    Parameters
    ----------
    vec : numpy.ndarray
        Dense input vector.
    eps : float
        Feature weight threshold. Only features with `abs(weight) > eps` are kept in the BoW result.

    Returns
    -------
    list of (int, float)
        BoW format of `vec`, with near-zero values omitted (sparse vector).

    See Also
    --------
    :func:`~gensim.matutils.sparse2full`
        Convert a document in Gensim bag-of-words format into a dense numpy array.

    """
    dense = np.asarray(vec, dtype=float)
    significant = np.abs(dense) > eps
    keep = np.nonzero(significant)[0]  # positions of the entries that survive the threshold
    return list(zip(keep, dense.take(keep)))


# alias of full2sparse
dense2vec = full2sparse
429
+
430
+
431
def full2sparse_clipped(vec, topn, eps=1e-9):
    """Like :func:`~gensim.matutils.full2sparse`, but keep only the `topn` elements of the greatest magnitude (abs).

    This is more efficient than sorting a vector and then taking the greatest values, especially
    where `len(vec) >> topn`.

    Parameters
    ----------
    vec : numpy.ndarray
        Input dense vector.
    topn : int
        Number of greatest (abs) elements to keep in the result.
    eps : float
        Threshold: only coordinates with `abs(value) > eps` are candidates for the result.

    Returns
    -------
    list of (int, float)
        Clipped vector in BoW format.

    See Also
    --------
    :func:`~gensim.matutils.full2sparse`
        Convert dense array to gensim bag-of-words format.

    """
    if topn <= 0:
        return []
    dense = np.asarray(vec, dtype=float)
    magnitudes = abs(dense)
    candidates = np.nonzero(magnitudes > eps)[0]
    # rank only the surviving candidates and only form the tuples that are actually returned
    ranked = argsort(magnitudes.take(candidates), topn, reverse=True)
    biggest = candidates.take(ranked)
    return list(zip(biggest, dense.take(biggest)))
465
+
466
+
467
def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32):
    """Convert corpus into a dense numpy 2D array, with documents as columns.

    Parameters
    ----------
    corpus : iterable of iterable of (int, number)
        Input corpus in the Gensim bag-of-words format.
    num_terms : int
        Number of terms in the dictionary; number of rows of the resulting matrix.
    num_docs : int, optional
        Number of documents in the corpus; number of columns of the resulting matrix.
        If provided, the output is pre-allocated instead of being stacked from per-document columns.
    dtype : data-type, optional
        Data type of the output matrix.

    Returns
    -------
    numpy.ndarray
        Dense 2D array that presents `corpus`.

    See Also
    --------
    :class:`~gensim.matutils.Dense2Corpus`
        Convert dense matrix to Gensim corpus format.

    """
    if num_docs is None:
        # np.column_stack needs a real sequence (generators are deprecated since NumPy 1.16)
        columns = [sparse2full(doc, num_terms) for doc in corpus]
        result = np.column_stack(columns)
    else:
        # the number of documents is known => fill a pre-allocated matrix column by column
        result = np.empty((num_terms, num_docs), dtype=dtype)
        docno = -1  # stays -1 for an empty corpus so the check below still works
        for docno, doc in enumerate(corpus):
            result[:, docno] = sparse2full(doc, num_terms)
        assert docno + 1 == num_docs
    return result.astype(dtype)
508
+
509
+
510
class Dense2Corpus:
    """Treat dense numpy array as a streamed Gensim corpus in the bag-of-words format.

    Notes
    -----
    No data copy is made (changes to the underlying matrix imply changes in the streamed corpus).

    See Also
    --------
    :func:`~gensim.matutils.corpus2dense`
        Convert Gensim corpus to dense matrix.
    :class:`~gensim.matutils.Sparse2Corpus`
        Convert sparse matrix to Gensim corpus format.

    """
    def __init__(self, dense, documents_columns=True):
        """

        Parameters
        ----------
        dense : numpy.ndarray
            Corpus in dense format.
        documents_columns : bool, optional
            Documents in `dense` represented as columns, as opposed to rows?

        """
        # store the data so that iterating over `self.dense` walks over documents
        self.dense = dense.T if documents_columns else dense

    def __iter__(self):
        """Iterate over the corpus.

        Yields
        ------
        list of (int, float)
            Document in BoW format.

        """
        for document in self.dense:
            yield full2sparse(document.flat)

    def __len__(self):
        """Return the number of documents."""
        return len(self.dense)
555
+
556
+
557
class Sparse2Corpus:
    """Convert a matrix in scipy.sparse format into a streaming Gensim corpus.

    See Also
    --------
    :func:`~gensim.matutils.corpus2csc`
        Convert gensim corpus format to `scipy.sparse.csc` matrix
    :class:`~gensim.matutils.Dense2Corpus`
        Convert dense matrix to gensim corpus.

    """
    def __init__(self, sparse, documents_columns=True):
        """

        Parameters
        ----------
        sparse : `scipy.sparse`
            Corpus scipy sparse format
        documents_columns : bool, optional
            Are the documents stored as columns of `sparse` (as opposed to rows)?

        """
        if documents_columns:
            self.sparse = sparse.tocsc()
        else:
            self.sparse = sparse.tocsr().T  # make sure shape[1]=number of docs (needed in len())

    def __iter__(self):
        """

        Yields
        ------
        list of (int, float)
            Document in BoW format.

        """
        for indprev, indnow in zip(self.sparse.indptr, self.sparse.indptr[1:]):
            yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow]))

    def __len__(self):
        """Return the number of documents (columns of the stored matrix)."""
        return self.sparse.shape[1]

    def __getitem__(self, key):
        """
        Retrieve a document vector or subset from the corpus by key.

        Parameters
        ----------
        key: int, ellipsis, slice, iterable object
            Index of the document retrieve. Negative integers count from the end of the corpus.
            Less commonly, the key can also be a slice, ellipsis, or an iterable
            to retrieve multiple documents.

        Returns
        -------
        list of (int, number), Sparse2Corpus
            Document in BoW format when `key` is an integer. Otherwise :class:`~gensim.matutils.Sparse2Corpus`.

        Raises
        ------
        IndexError
            If `key` is an integer outside the range of the corpus.

        """
        sparse = self.sparse
        if isinstance(key, int):
            num_docs = len(self)
            # Normalise negative indices first: reading indptr[key]:indptr[key + 1] with a
            # raw negative key selects the wrong (empty) range instead of the requested document.
            if key < 0:
                key += num_docs
            if not 0 <= key < num_docs:
                raise IndexError("document index out of range")
            iprev = sparse.indptr[key]
            inow = sparse.indptr[key + 1]
            return list(zip(sparse.indices[iprev:inow], sparse.data[iprev:inow]))

        # any other key selects a subset of columns, i.e. a sub-corpus
        return Sparse2Corpus(sparse[:, key])
623
+
624
+
625
def veclen(vec):
    """Compute the L2 (euclidean) length of a sparse vector.

    Parameters
    ----------
    vec : list of (int, number)
        Input vector in sparse bag-of-words format.

    Returns
    -------
    float
        Length of `vec`; 0.0 for an empty vector.

    """
    if len(vec) == 0:
        return 0.0
    sum_of_squares = sum(val ** 2 for _, val in vec)
    length = 1.0 * math.sqrt(sum_of_squares)
    assert length > 0.0, "sparse documents must not contain any explicit zero entries"
    return length
644
+
645
+
646
def ret_normalized_vec(vec, length):
    """Divide every weight of a BoW vector by `length`.

    Parameters
    ----------
    vec : list of (int, number)
        Input vector in BoW format.
    length : float
        Length of the vector, used as the divisor.

    Returns
    -------
    list of (int, number)
        Vector in BoW format with each weight divided by `length`
        (a plain copy of `vec` when `length` is exactly 1.0).

    """
    if length == 1.0:
        # nothing to rescale; still return a new list
        return list(vec)
    return [(termid, val / length) for termid, val in vec]
666
+
667
+
668
def ret_log_normalize_vec(vec, axis=1):
    """Normalize log-domain values so that their exponentials sum to one.

    The input array is never modified; a new array is returned for both 1D and 2D input.

    Parameters
    ----------
    vec : numpy.ndarray
        1D or 2D array of log-domain values.
    axis : {0, 1}, optional
        Only used for 2D input: with 1, each row is normalized independently; with 0, each column is.

    Returns
    -------
    numpy.ndarray
        `vec` with the log of the normalizer subtracted.
    float or numpy.ndarray
        Log of the normalizer, i.e. `log(sum(exp(vec)))`: a scalar for 1D input,
        one value per normalized row/column for 2D input.

    Raises
    ------
    ValueError
        If `vec` is 2D and `axis` is neither 0 nor 1.

    """
    # Shift the values before exponentiating so that the largest term is far from overflow.
    log_max = 100.0
    if len(vec.shape) == 1:
        max_val = np.max(vec)
        log_shift = log_max - np.log(len(vec) + 1.0) - max_val
        tot = np.sum(np.exp(vec + log_shift))
        log_norm = np.log(tot) - log_shift
        # build a new array instead of `vec -= log_norm`, which would silently
        # overwrite the caller's data (and differ from the 2D branch below)
        vec = vec - log_norm
    else:
        if axis == 1:  # independently normalize each sample
            max_val = np.max(vec, 1)
            log_shift = log_max - np.log(vec.shape[1] + 1.0) - max_val
            tot = np.sum(np.exp(vec + log_shift[:, np.newaxis]), 1)
            log_norm = np.log(tot) - log_shift
            vec = vec - log_norm[:, np.newaxis]
        elif axis == 0:  # normalize each feature
            k = ret_log_normalize_vec(vec.T)
            return k[0].T, k[1]
        else:
            raise ValueError("'%s' is not a supported axis" % axis)
    return vec, log_norm
689
+
690
+
691
# Double-precision BLAS routines (norm and scaling), resolved once at import time.
blas_nrm2 = get_blas_funcs(('nrm2',), (np.array([], dtype=float),))[0]
blas_scal = get_blas_funcs(('scal',), (np.array([], dtype=float),))[0]


def unitvec(vec, norm='l2', return_norm=False):
    """Scale a vector to unit length.

    Parameters
    ----------
    vec : {numpy.ndarray, scipy.sparse, list of (int, float)}
        Input vector in any format
    norm : {'l1', 'l2', 'unique'}, optional
        Metric to normalize in.
    return_norm : bool, optional
        Return the length of vector `vec`, in addition to the normalized vector itself?

    Returns
    -------
    {numpy.ndarray, scipy.sparse, list of (int, float)}
        Normalized vector in same format as `vec`.
    float
        Length of `vec` before normalization, if `return_norm` is set.
        For a zero or empty vector, 1.0 is reported.

    Raises
    ------
    ValueError
        If `norm` is not supported, or if `vec` is not in one of the recognized formats.

    Notes
    -----
    Zero-vector will be unchanged.

    """
    supported_norms = ('l1', 'l2', 'unique')
    if norm not in supported_norms:
        raise ValueError("'%s' is not a supported norm. Currently supported norms are %s." % (norm, supported_norms))

    def _result(vector, length):
        # attach the norm only when the caller asked for it
        return (vector, length) if return_norm else vector

    if scipy.sparse.issparse(vec):
        vec = vec.tocsr()
        if norm == 'l1':
            length = np.sum(np.abs(vec.data))
        elif norm == 'l2':
            length = np.sqrt(np.sum(vec.data ** 2))
        else:  # 'unique'
            length = vec.nnz
        if not length > 0.0:
            return _result(vec, 1.0)
        if np.issubdtype(vec.dtype, np.integer):
            vec = vec.astype(float)  # integer storage cannot hold the scaled values
        vec /= length
        return _result(vec, length)

    if isinstance(vec, np.ndarray):
        if norm == 'l1':
            length = np.sum(np.abs(vec))
        elif norm == 'l2':
            length = blas_nrm2(vec) if vec.size != 0 else 0.0
        else:  # 'unique'
            length = np.count_nonzero(vec)
        if not length > 0.0:
            return _result(vec, 1.0)
        if np.issubdtype(vec.dtype, np.integer):
            vec = vec.astype(float)  # integer storage cannot hold the scaled values
        return _result(blas_scal(1.0 / length, vec).astype(vec.dtype), length)

    try:
        first = next(iter(vec))  # is there at least one element?
    except StopIteration:
        return _result(vec, 1.0)

    # anything that is not a sequence of 2-item tuples/lists is rejected
    if not (isinstance(first, (tuple, list)) and len(first) == 2):
        raise ValueError("unknown input type")

    # gensim sparse format
    if norm == 'l1':
        length = float(sum(abs(val) for _, val in vec))
    elif norm == 'l2':
        length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
    else:  # 'unique'
        length = 1.0 * len(vec)
    assert length > 0.0, "sparse documents must not contain any explicit zero entries"
    return _result(ret_normalized_vec(vec, length), length)
790
+
791
+
792
def cossim(vec1, vec2):
    """Compute the cosine similarity between two sparse vectors.

    Cosine similarity is a number between `<-1.0, 1.0>`, higher means more similar.

    Parameters
    ----------
    vec1 : list of (int, float)
        Vector in BoW format.
    vec2 : list of (int, float)
        Vector in BoW format.

    Returns
    -------
    float
        Cosine similarity between `vec1` and `vec2`; 0.0 if either vector is empty.

    """
    weights1, weights2 = dict(vec1), dict(vec2)
    if not (weights1 and weights2):
        return 0.0
    norm1 = 1.0 * math.sqrt(sum(val * val for val in weights1.values()))
    norm2 = 1.0 * math.sqrt(sum(val * val for val in weights2.values()))
    assert norm1 > 0.0 and norm2 > 0.0, "sparse documents must not contain any explicit zero entries"
    # iterate over the shorter vector and look its ids up in the longer one
    if len(weights2) < len(weights1):
        shorter, longer = weights2, weights1
    else:
        shorter, longer = weights1, weights2
    dot = sum(value * longer.get(index, 0.0) for index, value in shorter.items())
    return dot / (norm1 * norm2)  # rescale by vector lengths
821
+
822
+
823
def isbow(vec):
    """Check whether a vector looks like the sparse Gensim bag-of-words format.

    Only the first element is inspected: it must unpack into two items
    that convert to `int` and `float` respectively.

    Parameters
    ----------
    vec : object
        Object to check.

    Returns
    -------
    bool
        True if `vec` is in BoW format (an empty input counts as BoW), False otherwise.

    """
    if scipy.sparse.issparse(vec):
        vec = vec.todense().tolist()
    try:
        id_, val_ = vec[0]  # probe the first entry: does it unpack into an (id, value) pair?
        int(id_), float(val_)
    except IndexError:
        return True  # this is to handle the empty input case
    except (ValueError, TypeError):
        return False
    else:
        return True
847
+
848
+
849
def _convert_vec(vec1, vec2, num_features=None):
    """Bring two distribution vectors into a dense form suitable for the distance functions.

    Parameters
    ----------
    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    num_features : int, optional
        Dimensionality of the dense vectors built from BoW input. If omitted, it is chosen
        large enough to hold every feature id present in either vector.

    Returns
    -------
    (vec1, vec2)
        Both vectors as dense arrays when they were given in BoW format; otherwise the
        (densified) inputs, unwrapped from a single-row container if necessary.

    """
    if scipy.sparse.issparse(vec1):
        vec1 = vec1.toarray()
    if scipy.sparse.issparse(vec2):
        vec2 = vec2.toarray()  # converted both the vectors to dense in case they were in sparse matrix
    if isbow(vec1) and isbow(vec2):  # if they are in bag of words format we make it dense
        if num_features is not None:  # if not None, make as large as the documents drawing from
            length = num_features
        else:
            # The number of entries alone is not enough: an id at or above that count would
            # fall outside the dense vector. Make sure the largest id fits as well.
            ids = [int(id_) for id_, _ in vec1] + [int(id_) for id_, _ in vec2]
            length = max(len(vec1), len(vec2), max(ids) + 1 if ids else 0)
        return sparse2full(vec1, length), sparse2full(vec2, length)

    # this conversion is made because if it is not in bow format, it might be a list within a list after conversion
    # the scipy implementation of Kullback fails in such a case so we pick up only the nested list.
    if len(vec1) == 1:
        vec1 = vec1[0]
    if len(vec2) == 1:
        vec2 = vec2[0]
    return vec1, vec2
872
+
873
+
874
def kullback_leibler(vec1, vec2, num_features=None):
    """Compute the Kullback-Leibler distance between two probability distributions using `scipy.stats.entropy`.

    Parameters
    ----------
    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    num_features : int, optional
        Number of features in the vectors.

    Returns
    -------
    float
        Kullback-Leibler distance between `vec1` and `vec2`.
        Value in range [0, +∞) where values closer to 0 mean less distance (higher similarity).

    """
    dense1, dense2 = _convert_vec(vec1, vec2, num_features=num_features)
    # entropy(p, q) with two arguments is the relative entropy of p with respect to q
    return entropy(dense1, dense2)
895
+
896
+
897
def jensen_shannon(vec1, vec2, num_features=None):
    """Calculate Jensen-Shannon divergence between two probability distributions using `scipy.stats.entropy`.

    Parameters
    ----------
    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    num_features : int, optional
        Number of features in the vectors.

    Returns
    -------
    float
        Jensen-Shannon divergence between `vec1` and `vec2`, i.e. the mean of the two
        Kullback-Leibler terms measured against the averaged distribution. No square root is taken.

    Notes
    -----
    This is a symmetric and finite "version" of :func:`gensim.matutils.kullback_leibler`.

    """
    vec1, vec2 = _convert_vec(vec1, vec2, num_features=num_features)
    # Coerce to arrays so that `+` below is element-wise; for plain Python lists
    # it would concatenate and the scaling by 0.5 would then raise TypeError.
    vec1, vec2 = np.asarray(vec1), np.asarray(vec2)
    avg_vec = 0.5 * (vec1 + vec2)
    return 0.5 * (entropy(vec1, avg_vec) + entropy(vec2, avg_vec))
922
+
923
+
924
def hellinger(vec1, vec2):
    """Compute the Hellinger distance of two probability distributions.

    Parameters
    ----------
    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.

    Returns
    -------
    float
        Hellinger distance between `vec1` and `vec2`.
        Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).

    """
    if scipy.sparse.issparse(vec1):
        vec1 = vec1.toarray()
    if scipy.sparse.issparse(vec2):
        vec2 = vec2.toarray()

    if not (isbow(vec1) and isbow(vec2)):
        # dense inputs: element-wise difference of the square roots
        root_gap = np.sqrt(vec1) - np.sqrt(vec2)
        return np.sqrt(0.5 * (root_gap ** 2).sum())

    # BoW inputs: look weights up by feature id instead of building dense vectors
    weights1, weights2 = dict(vec1), dict(vec2)
    feature_ids = set([*weights1, *weights2])
    squared_gaps = (
        (np.sqrt(weights1.get(feature_id, 0.0)) - np.sqrt(weights2.get(feature_id, 0.0)))**2
        for feature_id in feature_ids
    )
    return np.sqrt(0.5 * sum(squared_gaps))
956
+
957
+
958
def jaccard(vec1, vec2):
    """Calculate Jaccard distance between two vectors.

    Parameters
    ----------
    vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.
    vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
        Distribution vector.

    Returns
    -------
    float
        Jaccard distance between `vec1` and `vec2`.
        Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).
        If there is nothing to compare (zero total weight in BoW format, or no elements at all otherwise),
        1.0 is returned, the same convention as :func:`jaccard_distance` uses for two empty sets.

    """
    # converting from sparse for easier manipulation
    if scipy.sparse.issparse(vec1):
        vec1 = vec1.toarray()
    if scipy.sparse.issparse(vec2):
        vec2 = vec2.toarray()

    if isbow(vec1) and isbow(vec2):
        # if it's in bow format, we use the following definitions:
        # union = sum of the 'weights' of both the bags
        # intersection = lowest weight for a particular id; basically the number of common words or items
        union = sum(weight for _, weight in vec1) + sum(weight for _, weight in vec2)
        if union == 0:
            return 1.  # avoid dividing by zero when neither bag carries any weight
        vec1, vec2 = dict(vec1), dict(vec2)
        intersection = sum(
            (min(feature_weight, vec2.get(feature_id, 0.0)) for feature_id, feature_weight in vec1.items()),
            0.0,
        )
        return 1 - float(intersection) / float(union)

    # if it isn't in bag of words format, we can use sets to calculate intersection and union
    # Arrays are flattened first: a densified sparse matrix is 2-d, and its `tolist()` would yield
    # nested (unhashable) lists that `set` cannot hold.
    if isinstance(vec1, np.ndarray):
        vec1 = vec1.ravel().tolist()
    if isinstance(vec2, np.ndarray):
        vec2 = vec2.ravel().tolist()
    vec1 = set(vec1)
    vec2 = set(vec2)
    union = vec1 | vec2
    if not union:
        return 1.  # both inputs are empty
    intersection = vec1 & vec2
    return 1 - float(len(intersection)) / float(len(union))
1002
+
1003
+
1004
def jaccard_distance(set1, set2):
    """Compute the Jaccard distance of two sets.

    Parameters
    ----------
    set1 : set
        Input set.
    set2 : set
        Input set.

    Returns
    -------
    float
        Jaccard distance between `set1` and `set2`.
        Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).
    """
    combined = set1 | set2
    if not combined:  # neither set has any element
        return 1.

    shared = set1 & set2
    return 1. - float(len(shared)) / float(len(combined))
1026
+
1027
+
1028
try:
    # prefer the fast, cythonized implementations whenever the extension can be imported
    from gensim._matutils import logsumexp, mean_absolute_difference, dirichlet_expectation

except ImportError:
    def logsumexp(x):
        """Log of sum of exponentials.

        Parameters
        ----------
        x : numpy.ndarray
            Input 2d matrix.

        Returns
        -------
        float
            log of sum of exponentials of elements in `x`.

        Warnings
        --------
        For performance reasons, doesn't support NaNs or 1d, 3d, etc arrays like :func:`scipy.special.logsumexp`.

        """
        # shift by the maximum before exponentiating, then add it back after the log
        peak = np.max(x)
        return np.log(np.sum(np.exp(x - peak))) + peak

    def mean_absolute_difference(a, b):
        """Mean absolute difference between two arrays.

        Parameters
        ----------
        a : numpy.ndarray
            Input 1d array.
        b : numpy.ndarray
            Input 1d array.

        Returns
        -------
        float
            mean(abs(a - b)).

        """
        gaps = np.abs(a - b)
        return np.mean(gaps)

    def dirichlet_expectation(alpha):
        """Expected value of log(theta) where theta is drawn from a Dirichlet distribution.

        Parameters
        ----------
        alpha : numpy.ndarray
            Dirichlet parameter 2d matrix or 1d vector, if 2d - each row is treated as a separate parameter vector.

        Returns
        -------
        numpy.ndarray
            Log of expected values, dimension same as `alpha.ndim`.

        """
        if len(alpha.shape) == 1:
            total_term = psi(np.sum(alpha))
        else:
            # one total per row, reshaped into a column so that it broadcasts across that row
            total_term = psi(np.sum(alpha, 1))[:, np.newaxis]
        expectation = psi(alpha) - total_term
        return expectation.astype(alpha.dtype, copy=False)  # keep the same precision as input
1094
+
1095
+
1096
def qr_destroy(la):
    """Get QR decomposition of `la[0]`.

    Parameters
    ----------
    la : list of numpy.ndarray
        Run QR decomposition on the first element of `la`. Must not be empty.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Matrices :math:`Q` and :math:`R`.

    Raises
    ------
    AssertionError
        If a LAPACK routine reports a negative `info`, or if the resulting `Q` is not Fortran-contiguous.

    Notes
    -----
    Using this function is less memory intense than calling `scipy.linalg.qr(la[0])`,
    because the memory used in `la[0]` is reclaimed earlier. This makes a difference when
    decomposing very large arrays, where every memory copy counts.

    Warnings
    --------
    Content of `la` as well as `la[0]` gets destroyed in the process. Again, for memory-efficiency reasons.

    """
    a = np.asfortranarray(la[0])
    # drop the list slot and the local name `la`; this is what lets the input be reclaimed early
    del la[0], la  # now `a` is the only reference to the input matrix
    m, n = a.shape
    # perform q, r = QR(a); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix", str(a.shape))
    geqrf, = get_lapack_funcs(('geqrf',), (a,))
    # first call with lwork=-1 follows the LAPACK workspace-query convention;
    # the size it reports in work[0] is fed to the second call, which does the factorisation
    qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True)
    qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True)
    del a  # free up mem
    assert info >= 0
    # R is the upper triangle of the leading block of the factorised matrix
    r = np.triu(qr[:n, :n])
    if m < n:  # rare case, #features < #topics
        qr = qr[:, :m]  # retains fortran order
    gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
    # same two-step pattern as above: workspace query first, then the call that builds Q
    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous
    return q, r
1139
+
1140
+
1141
class MmWriter:
    """Store a corpus in `Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_,
    using :class:`~gensim.corpora.mmcorpus.MmCorpus`.

    Notes
    -----
    The output is written one document at a time, not the whole matrix at once (unlike e.g. `scipy.io.mmread`).
    This allows you to write corpora which are larger than the available RAM.

    The output file is created in a single pass through the input corpus, so that the input can be
    a once-only stream (generator).

    To achieve this, a fake MM header is written first, corpus statistics are collected
    during the pass (shape of the matrix, number of non-zeroes), followed by a seek back to the beginning of the file,
    rewriting the fake header with the final values.

    """
    HEADER_LINE = b'%%MatrixMarket matrix coordinate real general\n'  # the only supported MM format

    def __init__(self, fname):
        """

        Parameters
        ----------
        fname : str
            Path to output file.

        Raises
        ------
        NotImplementedError
            If `fname` ends with ".gz" or ".bz2"; compressed output is not supported.

        """
        self.fname = fname
        if fname.endswith((".gz", ".bz2")):
            raise NotImplementedError("compressed output not supported with MmWriter")
        self.fout = utils.open(self.fname, 'wb+')  # open for both reading and writing
        self.headers_written = False

    def write_headers(self, num_docs, num_terms, num_nnz):
        """Write headers to file.

        Parameters
        ----------
        num_docs : int
            Number of documents in corpus.
        num_terms : int
            Number of terms in corpus.
        num_nnz : int
            Number of non-zero elements in corpus. A negative value means the statistics are not known yet,
            in which case a blank placeholder line is written instead of the real numbers.

        """
        self.fout.write(MmWriter.HEADER_LINE)

        if num_nnz < 0:
            # we don't know the matrix shape/density yet, so only log a general line
            logger.info("saving sparse matrix to %s", self.fname)
            # reserve a 50-character stats line, to be overwritten by `fake_headers` later
            self.fout.write(utils.to_utf8(' ' * 50 + '\n'))
        else:
            logger.info(
                "saving sparse %sx%s matrix with %i non-zero entries to %s",
                num_docs, num_terms, num_nnz, self.fname
            )
            self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz)))
        self.last_docno = -1
        self.headers_written = True

    def fake_headers(self, num_docs, num_terms, num_nnz):
        """Write "fake" headers to file, to be rewritten once we've scanned the entire corpus.

        Parameters
        ----------
        num_docs : int
            Number of documents in corpus.
        num_terms : int
            Number of terms in corpus.
        num_nnz : int
            Number of non-zero elements in corpus.

        Raises
        ------
        ValueError
            If the formatted statistics do not fit into the 50 characters reserved for them.

        """
        stats = '%i %i %i' % (num_docs, num_terms, num_nnz)
        if len(stats) > 50:
            raise ValueError('Invalid stats: matrix too large!')
        # the stats line sits directly after the fixed format line
        self.fout.seek(len(MmWriter.HEADER_LINE))
        self.fout.write(utils.to_utf8(stats))

    def write_vector(self, docno, vector):
        """Write a single sparse vector to the file.

        Parameters
        ----------
        docno : int
            Number of document.
        vector : list of (int, number)
            Document in BoW format.

        Returns
        -------
        (int, int)
            Max word index in vector and len of vector. If vector is empty, return (-1, 0).

        """
        assert self.headers_written, "must write Matrix Market file headers before writing data!"
        assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno)
        vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12)  # ignore near-zero entries
        for termid, weight in vector:  # write term ids in sorted order
            # +1 because MM format starts counting from 1
            self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight)))
        self.last_docno = docno
        return (vector[-1][0], len(vector)) if vector else (-1, 0)

    @staticmethod
    def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False):
        """Save the corpus to disk in `Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_.

        Parameters
        ----------
        fname : str
            Filename of the resulting file.
        corpus : iterable of list of (int, number)
            Corpus in streamed bag-of-words format.
        progress_cnt : int, optional
            Print progress for every `progress_cnt` number of documents.
        index : bool, optional
            Return offsets?
        num_terms : int, optional
            Number of terms in the corpus. If provided, the `corpus.num_terms` attribute (if any) will be ignored.
        metadata : bool, optional
            Generate a metadata file?

        Returns
        -------
        offsets : {list of int, None}
            List of offsets (if index=True) or nothing.

        Notes
        -----
        Documents are processed one at a time, so the whole corpus is allowed to be larger than the available RAM.

        The output file is closed even if writing fails part-way, and the `metadata` attribute of `corpus`
        (if it has one) is put back to its original value once the pass over the corpus has ended.

        See Also
        --------
        :func:`gensim.corpora.mmcorpus.MmCorpus.save_corpus`
            Save corpus to disk.

        """
        mw = MmWriter(fname)
        try:
            # write empty headers to the file (with enough space to be overwritten later)
            mw.write_headers(-1, -1, -1)  # will print 50 spaces followed by newline on the stats line

            # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors
            _num_terms, num_nnz = 0, 0
            docno, poslast = -1, -1
            offsets = []
            has_metadata_attr = hasattr(corpus, 'metadata')
            if has_metadata_attr:
                orig_metadata = corpus.metadata
                corpus.metadata = metadata
                if metadata:
                    docno2metadata = {}
            else:
                metadata = False
            try:
                for docno, doc in enumerate(corpus):
                    if metadata:
                        bow, data = doc
                        docno2metadata[docno] = data
                    else:
                        bow = doc
                    if docno % progress_cnt == 0:
                        logger.info("PROGRESS: saving document #%i", docno)
                    if index:
                        posnow = mw.fout.tell()
                        if posnow == poslast:
                            # nothing was written for the previous document; mark its offset as missing
                            offsets[-1] = -1
                        offsets.append(posnow)
                        poslast = posnow
                    max_id, veclen = mw.write_vector(docno, bow)
                    _num_terms = max(_num_terms, 1 + max_id)
                    num_nnz += veclen
                if metadata:
                    utils.pickle(docno2metadata, fname + '.metadata.cpickle')
            finally:
                # always hand the corpus back in the state the caller gave it to us,
                # also when metadata=False or when the pass above raised
                if has_metadata_attr:
                    corpus.metadata = orig_metadata

            num_docs = docno + 1
            num_terms = num_terms or _num_terms

            if num_docs * num_terms != 0:
                logger.info(
                    "saved %ix%i matrix, density=%.3f%% (%i/%i)",
                    num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms), num_nnz, num_docs * num_terms
                )

            # now write proper headers, by seeking and overwriting the spaces written earlier
            mw.fake_headers(num_docs, num_terms, num_nnz)
        finally:
            # release the file handle on both the success and the failure path
            mw.close()

        if index:
            return offsets

    def __del__(self):
        """Close `self.fout` file. Alias for :meth:`~gensim.matutils.MmWriter.close`.

        Warnings
        --------
        Closing the file explicitly via the close() method is preferred and safer.

        """
        self.close()  # does nothing if called twice (on an already closed file), so no worries

    def close(self):
        """Close `self.fout` file."""
        logger.debug("closing %s", self.fname)
        # `fout` is missing if the constructor raised before opening the file
        if hasattr(self, 'fout'):
            self.fout.close()
1349
+
1350
+
1351
+ try:
1352
+ from gensim.corpora._mmreader import MmReader # noqa: F401
1353
+ except ImportError:
1354
+ raise utils.NO_CYTHON