_coo.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614
  1. """ A sparse matrix in COOrdinate or 'triplet' format"""
  2. __docformat__ = "restructuredtext en"
  3. __all__ = ['coo_matrix', 'isspmatrix_coo']
  4. from warnings import warn
  5. import numpy as np
  6. from ._sparsetools import coo_tocsr, coo_todense, coo_matvec
  7. from ._base import isspmatrix, SparseEfficiencyWarning, spmatrix
  8. from ._data import _data_matrix, _minmax_mixin
  9. from ._sputils import (upcast, upcast_char, to_native, isshape, getdtype,
  10. getdata, get_index_dtype, downcast_intp_index,
  11. check_shape, check_reshape_kwargs)
  12. import operator
  13. class coo_matrix(_data_matrix, _minmax_mixin):
  14. """
  15. A sparse matrix in COOrdinate format.
  16. Also known as the 'ijv' or 'triplet' format.
  17. This can be instantiated in several ways:
  18. coo_matrix(D)
  19. with a dense matrix D
  20. coo_matrix(S)
  21. with another sparse matrix S (equivalent to S.tocoo())
  22. coo_matrix((M, N), [dtype])
  23. to construct an empty matrix with shape (M, N)
  24. dtype is optional, defaulting to dtype='d'.
  25. coo_matrix((data, (i, j)), [shape=(M, N)])
  26. to construct from three arrays:
  27. 1. data[:] the entries of the matrix, in any order
  28. 2. i[:] the row indices of the matrix entries
  29. 3. j[:] the column indices of the matrix entries
  30. Where ``A[i[k], j[k]] = data[k]``. When shape is not
  31. specified, it is inferred from the index arrays
  32. Attributes
  33. ----------
  34. dtype : dtype
  35. Data type of the matrix
  36. shape : 2-tuple
  37. Shape of the matrix
  38. ndim : int
  39. Number of dimensions (this is always 2)
  40. nnz
  41. Number of stored values, including explicit zeros
  42. data
  43. COO format data array of the matrix
  44. row
  45. COO format row index array of the matrix
  46. col
  47. COO format column index array of the matrix
  48. Notes
  49. -----
  50. Sparse matrices can be used in arithmetic operations: they support
  51. addition, subtraction, multiplication, division, and matrix power.
  52. Advantages of the COO format
  53. - facilitates fast conversion among sparse formats
  54. - permits duplicate entries (see example)
  55. - very fast conversion to and from CSR/CSC formats
  56. Disadvantages of the COO format
  57. - does not directly support:
  58. + arithmetic operations
  59. + slicing
  60. Intended Usage
  61. - COO is a fast format for constructing sparse matrices
  62. - Once a matrix has been constructed, convert to CSR or
  63. CSC format for fast arithmetic and matrix vector operations
  64. - By default when converting to CSR or CSC format, duplicate (i,j)
  65. entries will be summed together. This facilitates efficient
  66. construction of finite element matrices and the like. (see example)
  67. Examples
  68. --------
  69. >>> # Constructing an empty matrix
  70. >>> import numpy as np
  71. >>> from scipy.sparse import coo_matrix
  72. >>> coo_matrix((3, 4), dtype=np.int8).toarray()
  73. array([[0, 0, 0, 0],
  74. [0, 0, 0, 0],
  75. [0, 0, 0, 0]], dtype=int8)
  76. >>> # Constructing a matrix using ijv format
  77. >>> row = np.array([0, 3, 1, 0])
  78. >>> col = np.array([0, 3, 1, 2])
  79. >>> data = np.array([4, 5, 7, 9])
  80. >>> coo_matrix((data, (row, col)), shape=(4, 4)).toarray()
  81. array([[4, 0, 9, 0],
  82. [0, 7, 0, 0],
  83. [0, 0, 0, 0],
  84. [0, 0, 0, 5]])
  85. >>> # Constructing a matrix with duplicate indices
  86. >>> row = np.array([0, 0, 1, 3, 1, 0, 0])
  87. >>> col = np.array([0, 2, 1, 3, 1, 0, 0])
  88. >>> data = np.array([1, 1, 1, 1, 1, 1, 1])
  89. >>> coo = coo_matrix((data, (row, col)), shape=(4, 4))
  90. >>> # Duplicate indices are maintained until implicitly or explicitly summed
  91. >>> np.max(coo.data)
  92. 1
  93. >>> coo.toarray()
  94. array([[3, 0, 1, 0],
  95. [0, 2, 0, 0],
  96. [0, 0, 0, 0],
  97. [0, 0, 0, 1]])
  98. """
  99. format = 'coo'
  100. def __init__(self, arg1, shape=None, dtype=None, copy=False):
  101. _data_matrix.__init__(self)
  102. if isinstance(arg1, tuple):
  103. if isshape(arg1):
  104. M, N = arg1
  105. self._shape = check_shape((M, N))
  106. idx_dtype = get_index_dtype(maxval=max(M, N))
  107. data_dtype = getdtype(dtype, default=float)
  108. self.row = np.array([], dtype=idx_dtype)
  109. self.col = np.array([], dtype=idx_dtype)
  110. self.data = np.array([], dtype=data_dtype)
  111. self.has_canonical_format = True
  112. else:
  113. try:
  114. obj, (row, col) = arg1
  115. except (TypeError, ValueError) as e:
  116. raise TypeError('invalid input format') from e
  117. if shape is None:
  118. if len(row) == 0 or len(col) == 0:
  119. raise ValueError('cannot infer dimensions from zero '
  120. 'sized index arrays')
  121. M = operator.index(np.max(row)) + 1
  122. N = operator.index(np.max(col)) + 1
  123. self._shape = check_shape((M, N))
  124. else:
  125. # Use 2 steps to ensure shape has length 2.
  126. M, N = shape
  127. self._shape = check_shape((M, N))
  128. idx_dtype = get_index_dtype(maxval=max(self.shape))
  129. self.row = np.array(row, copy=copy, dtype=idx_dtype)
  130. self.col = np.array(col, copy=copy, dtype=idx_dtype)
  131. self.data = getdata(obj, copy=copy, dtype=dtype)
  132. self.has_canonical_format = False
  133. else:
  134. if isspmatrix(arg1):
  135. if isspmatrix_coo(arg1) and copy:
  136. self.row = arg1.row.copy()
  137. self.col = arg1.col.copy()
  138. self.data = arg1.data.copy()
  139. self._shape = check_shape(arg1.shape)
  140. else:
  141. coo = arg1.tocoo()
  142. self.row = coo.row
  143. self.col = coo.col
  144. self.data = coo.data
  145. self._shape = check_shape(coo.shape)
  146. self.has_canonical_format = False
  147. else:
  148. #dense argument
  149. M = np.atleast_2d(np.asarray(arg1))
  150. if M.ndim != 2:
  151. raise TypeError('expected dimension <= 2 array or matrix')
  152. self._shape = check_shape(M.shape)
  153. if shape is not None:
  154. if check_shape(shape) != self._shape:
  155. raise ValueError('inconsistent shapes: %s != %s' %
  156. (shape, self._shape))
  157. self.row, self.col = M.nonzero()
  158. self.data = M[self.row, self.col]
  159. self.has_canonical_format = True
  160. if dtype is not None:
  161. self.data = self.data.astype(dtype, copy=False)
  162. self._check()
  163. def reshape(self, *args, **kwargs):
  164. shape = check_shape(args, self.shape)
  165. order, copy = check_reshape_kwargs(kwargs)
  166. # Return early if reshape is not required
  167. if shape == self.shape:
  168. if copy:
  169. return self.copy()
  170. else:
  171. return self
  172. nrows, ncols = self.shape
  173. if order == 'C':
  174. # Upcast to avoid overflows: the coo_matrix constructor
  175. # below will downcast the results to a smaller dtype, if
  176. # possible.
  177. dtype = get_index_dtype(maxval=(ncols * max(0, nrows - 1) + max(0, ncols - 1)))
  178. flat_indices = np.multiply(ncols, self.row, dtype=dtype) + self.col
  179. new_row, new_col = divmod(flat_indices, shape[1])
  180. elif order == 'F':
  181. dtype = get_index_dtype(maxval=(nrows * max(0, ncols - 1) + max(0, nrows - 1)))
  182. flat_indices = np.multiply(nrows, self.col, dtype=dtype) + self.row
  183. new_col, new_row = divmod(flat_indices, shape[0])
  184. else:
  185. raise ValueError("'order' must be 'C' or 'F'")
  186. # Handle copy here rather than passing on to the constructor so that no
  187. # copy will be made of new_row and new_col regardless
  188. if copy:
  189. new_data = self.data.copy()
  190. else:
  191. new_data = self.data
  192. return self.__class__((new_data, (new_row, new_col)),
  193. shape=shape, copy=False)
  194. reshape.__doc__ = spmatrix.reshape.__doc__
  195. def getnnz(self, axis=None):
  196. if axis is None:
  197. nnz = len(self.data)
  198. if nnz != len(self.row) or nnz != len(self.col):
  199. raise ValueError('row, column, and data array must all be the '
  200. 'same length')
  201. if self.data.ndim != 1 or self.row.ndim != 1 or \
  202. self.col.ndim != 1:
  203. raise ValueError('row, column, and data arrays must be 1-D')
  204. return int(nnz)
  205. if axis < 0:
  206. axis += 2
  207. if axis == 0:
  208. return np.bincount(downcast_intp_index(self.col),
  209. minlength=self.shape[1])
  210. elif axis == 1:
  211. return np.bincount(downcast_intp_index(self.row),
  212. minlength=self.shape[0])
  213. else:
  214. raise ValueError('axis out of bounds')
  215. getnnz.__doc__ = spmatrix.getnnz.__doc__
  216. def _check(self):
  217. """ Checks data structure for consistency """
  218. # index arrays should have integer data types
  219. if self.row.dtype.kind != 'i':
  220. warn("row index array has non-integer dtype (%s) "
  221. % self.row.dtype.name)
  222. if self.col.dtype.kind != 'i':
  223. warn("col index array has non-integer dtype (%s) "
  224. % self.col.dtype.name)
  225. idx_dtype = get_index_dtype(maxval=max(self.shape))
  226. self.row = np.asarray(self.row, dtype=idx_dtype)
  227. self.col = np.asarray(self.col, dtype=idx_dtype)
  228. self.data = to_native(self.data)
  229. if self.nnz > 0:
  230. if self.row.max() >= self.shape[0]:
  231. raise ValueError('row index exceeds matrix dimensions')
  232. if self.col.max() >= self.shape[1]:
  233. raise ValueError('column index exceeds matrix dimensions')
  234. if self.row.min() < 0:
  235. raise ValueError('negative row index found')
  236. if self.col.min() < 0:
  237. raise ValueError('negative column index found')
  238. def transpose(self, axes=None, copy=False):
  239. if axes is not None:
  240. raise ValueError(("Sparse matrices do not support "
  241. "an 'axes' parameter because swapping "
  242. "dimensions is the only logical permutation."))
  243. M, N = self.shape
  244. return self.__class__((self.data, (self.col, self.row)),
  245. shape=(N, M), copy=copy)
  246. transpose.__doc__ = spmatrix.transpose.__doc__
  247. def resize(self, *shape):
  248. shape = check_shape(shape)
  249. new_M, new_N = shape
  250. M, N = self.shape
  251. if new_M < M or new_N < N:
  252. mask = np.logical_and(self.row < new_M, self.col < new_N)
  253. if not mask.all():
  254. self.row = self.row[mask]
  255. self.col = self.col[mask]
  256. self.data = self.data[mask]
  257. self._shape = shape
  258. resize.__doc__ = spmatrix.resize.__doc__
  259. def toarray(self, order=None, out=None):
  260. """See the docstring for `spmatrix.toarray`."""
  261. B = self._process_toarray_args(order, out)
  262. fortran = int(B.flags.f_contiguous)
  263. if not fortran and not B.flags.c_contiguous:
  264. raise ValueError("Output array must be C or F contiguous")
  265. M,N = self.shape
  266. coo_todense(M, N, self.nnz, self.row, self.col, self.data,
  267. B.ravel('A'), fortran)
  268. return B
  269. def tocsc(self, copy=False):
  270. """Convert this matrix to Compressed Sparse Column format
  271. Duplicate entries will be summed together.
  272. Examples
  273. --------
  274. >>> from numpy import array
  275. >>> from scipy.sparse import coo_matrix
  276. >>> row = array([0, 0, 1, 3, 1, 0, 0])
  277. >>> col = array([0, 2, 1, 3, 1, 0, 0])
  278. >>> data = array([1, 1, 1, 1, 1, 1, 1])
  279. >>> A = coo_matrix((data, (row, col)), shape=(4, 4)).tocsc()
  280. >>> A.toarray()
  281. array([[3, 0, 1, 0],
  282. [0, 2, 0, 0],
  283. [0, 0, 0, 0],
  284. [0, 0, 0, 1]])
  285. """
  286. if self.nnz == 0:
  287. return self._csc_container(self.shape, dtype=self.dtype)
  288. else:
  289. M,N = self.shape
  290. idx_dtype = get_index_dtype((self.col, self.row),
  291. maxval=max(self.nnz, M))
  292. row = self.row.astype(idx_dtype, copy=False)
  293. col = self.col.astype(idx_dtype, copy=False)
  294. indptr = np.empty(N + 1, dtype=idx_dtype)
  295. indices = np.empty_like(row, dtype=idx_dtype)
  296. data = np.empty_like(self.data, dtype=upcast(self.dtype))
  297. coo_tocsr(N, M, self.nnz, col, row, self.data,
  298. indptr, indices, data)
  299. x = self._csc_container((data, indices, indptr), shape=self.shape)
  300. if not self.has_canonical_format:
  301. x.sum_duplicates()
  302. return x
  303. def tocsr(self, copy=False):
  304. """Convert this matrix to Compressed Sparse Row format
  305. Duplicate entries will be summed together.
  306. Examples
  307. --------
  308. >>> from numpy import array
  309. >>> from scipy.sparse import coo_matrix
  310. >>> row = array([0, 0, 1, 3, 1, 0, 0])
  311. >>> col = array([0, 2, 1, 3, 1, 0, 0])
  312. >>> data = array([1, 1, 1, 1, 1, 1, 1])
  313. >>> A = coo_matrix((data, (row, col)), shape=(4, 4)).tocsr()
  314. >>> A.toarray()
  315. array([[3, 0, 1, 0],
  316. [0, 2, 0, 0],
  317. [0, 0, 0, 0],
  318. [0, 0, 0, 1]])
  319. """
  320. if self.nnz == 0:
  321. return self._csr_container(self.shape, dtype=self.dtype)
  322. else:
  323. M,N = self.shape
  324. idx_dtype = get_index_dtype((self.row, self.col),
  325. maxval=max(self.nnz, N))
  326. row = self.row.astype(idx_dtype, copy=False)
  327. col = self.col.astype(idx_dtype, copy=False)
  328. indptr = np.empty(M + 1, dtype=idx_dtype)
  329. indices = np.empty_like(col, dtype=idx_dtype)
  330. data = np.empty_like(self.data, dtype=upcast(self.dtype))
  331. coo_tocsr(M, N, self.nnz, row, col, self.data,
  332. indptr, indices, data)
  333. x = self._csr_container((data, indices, indptr), shape=self.shape)
  334. if not self.has_canonical_format:
  335. x.sum_duplicates()
  336. return x
  337. def tocoo(self, copy=False):
  338. if copy:
  339. return self.copy()
  340. else:
  341. return self
  342. tocoo.__doc__ = spmatrix.tocoo.__doc__
  343. def todia(self, copy=False):
  344. self.sum_duplicates()
  345. ks = self.col - self.row # the diagonal for each nonzero
  346. diags, diag_idx = np.unique(ks, return_inverse=True)
  347. if len(diags) > 100:
  348. # probably undesired, should todia() have a maxdiags parameter?
  349. warn("Constructing a DIA matrix with %d diagonals "
  350. "is inefficient" % len(diags), SparseEfficiencyWarning)
  351. #initialize and fill in data array
  352. if self.data.size == 0:
  353. data = np.zeros((0, 0), dtype=self.dtype)
  354. else:
  355. data = np.zeros((len(diags), self.col.max()+1), dtype=self.dtype)
  356. data[diag_idx, self.col] = self.data
  357. return self._dia_container((data, diags), shape=self.shape)
  358. todia.__doc__ = spmatrix.todia.__doc__
  359. def todok(self, copy=False):
  360. self.sum_duplicates()
  361. dok = self._dok_container((self.shape), dtype=self.dtype)
  362. dok._update(zip(zip(self.row,self.col),self.data))
  363. return dok
  364. todok.__doc__ = spmatrix.todok.__doc__
  365. def diagonal(self, k=0):
  366. rows, cols = self.shape
  367. if k <= -rows or k >= cols:
  368. return np.empty(0, dtype=self.data.dtype)
  369. diag = np.zeros(min(rows + min(k, 0), cols - max(k, 0)),
  370. dtype=self.dtype)
  371. diag_mask = (self.row + k) == self.col
  372. if self.has_canonical_format:
  373. row = self.row[diag_mask]
  374. data = self.data[diag_mask]
  375. else:
  376. row, _, data = self._sum_duplicates(self.row[diag_mask],
  377. self.col[diag_mask],
  378. self.data[diag_mask])
  379. diag[row + min(k, 0)] = data
  380. return diag
  381. diagonal.__doc__ = _data_matrix.diagonal.__doc__
  382. def _setdiag(self, values, k):
  383. M, N = self.shape
  384. if values.ndim and not len(values):
  385. return
  386. idx_dtype = self.row.dtype
  387. # Determine which triples to keep and where to put the new ones.
  388. full_keep = self.col - self.row != k
  389. if k < 0:
  390. max_index = min(M+k, N)
  391. if values.ndim:
  392. max_index = min(max_index, len(values))
  393. keep = np.logical_or(full_keep, self.col >= max_index)
  394. new_row = np.arange(-k, -k + max_index, dtype=idx_dtype)
  395. new_col = np.arange(max_index, dtype=idx_dtype)
  396. else:
  397. max_index = min(M, N-k)
  398. if values.ndim:
  399. max_index = min(max_index, len(values))
  400. keep = np.logical_or(full_keep, self.row >= max_index)
  401. new_row = np.arange(max_index, dtype=idx_dtype)
  402. new_col = np.arange(k, k + max_index, dtype=idx_dtype)
  403. # Define the array of data consisting of the entries to be added.
  404. if values.ndim:
  405. new_data = values[:max_index]
  406. else:
  407. new_data = np.empty(max_index, dtype=self.dtype)
  408. new_data[:] = values
  409. # Update the internal structure.
  410. self.row = np.concatenate((self.row[keep], new_row))
  411. self.col = np.concatenate((self.col[keep], new_col))
  412. self.data = np.concatenate((self.data[keep], new_data))
  413. self.has_canonical_format = False
  414. # needed by _data_matrix
  415. def _with_data(self,data,copy=True):
  416. """Returns a matrix with the same sparsity structure as self,
  417. but with different data. By default the index arrays
  418. (i.e. .row and .col) are copied.
  419. """
  420. if copy:
  421. return self.__class__((data, (self.row.copy(), self.col.copy())),
  422. shape=self.shape, dtype=data.dtype)
  423. else:
  424. return self.__class__((data, (self.row, self.col)),
  425. shape=self.shape, dtype=data.dtype)
  426. def sum_duplicates(self):
  427. """Eliminate duplicate matrix entries by adding them together
  428. This is an *in place* operation
  429. """
  430. if self.has_canonical_format:
  431. return
  432. summed = self._sum_duplicates(self.row, self.col, self.data)
  433. self.row, self.col, self.data = summed
  434. self.has_canonical_format = True
  435. def _sum_duplicates(self, row, col, data):
  436. # Assumes (data, row, col) not in canonical format.
  437. if len(data) == 0:
  438. return row, col, data
  439. order = np.lexsort((row, col))
  440. row = row[order]
  441. col = col[order]
  442. data = data[order]
  443. unique_mask = ((row[1:] != row[:-1]) |
  444. (col[1:] != col[:-1]))
  445. unique_mask = np.append(True, unique_mask)
  446. row = row[unique_mask]
  447. col = col[unique_mask]
  448. unique_inds, = np.nonzero(unique_mask)
  449. data = np.add.reduceat(data, unique_inds, dtype=self.dtype)
  450. return row, col, data
  451. def eliminate_zeros(self):
  452. """Remove zero entries from the matrix
  453. This is an *in place* operation
  454. """
  455. mask = self.data != 0
  456. self.data = self.data[mask]
  457. self.row = self.row[mask]
  458. self.col = self.col[mask]
  459. #######################
  460. # Arithmetic handlers #
  461. #######################
  462. def _add_dense(self, other):
  463. if other.shape != self.shape:
  464. raise ValueError('Incompatible shapes ({} and {})'
  465. .format(self.shape, other.shape))
  466. dtype = upcast_char(self.dtype.char, other.dtype.char)
  467. result = np.array(other, dtype=dtype, copy=True)
  468. fortran = int(result.flags.f_contiguous)
  469. M, N = self.shape
  470. coo_todense(M, N, self.nnz, self.row, self.col, self.data,
  471. result.ravel('A'), fortran)
  472. return self._container(result, copy=False)
  473. def _mul_vector(self, other):
  474. #output array
  475. result = np.zeros(self.shape[0], dtype=upcast_char(self.dtype.char,
  476. other.dtype.char))
  477. coo_matvec(self.nnz, self.row, self.col, self.data, other, result)
  478. return result
  479. def _mul_multivector(self, other):
  480. result = np.zeros((other.shape[1], self.shape[0]),
  481. dtype=upcast_char(self.dtype.char, other.dtype.char))
  482. for i, col in enumerate(other.T):
  483. coo_matvec(self.nnz, self.row, self.col, self.data, col, result[i])
  484. return result.T.view(type=type(other))
  485. def isspmatrix_coo(x):
  486. """Is x of coo_matrix type?
  487. Parameters
  488. ----------
  489. x
  490. object to check for being a coo matrix
  491. Returns
  492. -------
  493. bool
  494. True if x is a coo matrix, False otherwise
  495. Examples
  496. --------
  497. >>> from scipy.sparse import coo_matrix, isspmatrix_coo
  498. >>> isspmatrix_coo(coo_matrix([[5]]))
  499. True
  500. >>> from scipy.sparse import coo_matrix, csr_matrix, isspmatrix_coo
  501. >>> isspmatrix_coo(csr_matrix([[5]]))
  502. False
  503. """
  504. from ._arrays import coo_array
  505. return isinstance(x, coo_matrix) or isinstance(x, coo_array)