_csr.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. """Compressed Sparse Row matrix format"""
  2. __docformat__ = "restructuredtext en"
  3. __all__ = ['csr_matrix', 'isspmatrix_csr']
  4. import numpy as np
  5. from ._base import spmatrix
  6. from ._sparsetools import (csr_tocsc, csr_tobsr, csr_count_blocks,
  7. get_csr_submatrix)
  8. from ._sputils import upcast, get_index_dtype
  9. from ._compressed import _cs_matrix
  10. class csr_matrix(_cs_matrix):
  11. """
  12. Compressed Sparse Row matrix
  13. This can be instantiated in several ways:
  14. csr_matrix(D)
  15. with a dense matrix or rank-2 ndarray D
  16. csr_matrix(S)
  17. with another sparse matrix S (equivalent to S.tocsr())
  18. csr_matrix((M, N), [dtype])
  19. to construct an empty matrix with shape (M, N)
  20. dtype is optional, defaulting to dtype='d'.
  21. csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
  22. where ``data``, ``row_ind`` and ``col_ind`` satisfy the
  23. relationship ``a[row_ind[k], col_ind[k]] = data[k]``.
  24. csr_matrix((data, indices, indptr), [shape=(M, N)])
  25. is the standard CSR representation where the column indices for
  26. row i are stored in ``indices[indptr[i]:indptr[i+1]]`` and their
  27. corresponding values are stored in ``data[indptr[i]:indptr[i+1]]``.
  28. If the shape parameter is not supplied, the matrix dimensions
  29. are inferred from the index arrays.
  30. Attributes
  31. ----------
  32. dtype : dtype
  33. Data type of the matrix
  34. shape : 2-tuple
  35. Shape of the matrix
  36. ndim : int
  37. Number of dimensions (this is always 2)
  38. nnz
  39. Number of stored values, including explicit zeros
  40. data
  41. CSR format data array of the matrix
  42. indices
  43. CSR format index array of the matrix
  44. indptr
  45. CSR format index pointer array of the matrix
  46. has_sorted_indices
  47. Whether indices are sorted
  48. Notes
  49. -----
  50. Sparse matrices can be used in arithmetic operations: they support
  51. addition, subtraction, multiplication, division, and matrix power.
  52. Advantages of the CSR format
  53. - efficient arithmetic operations CSR + CSR, CSR * CSR, etc.
  54. - efficient row slicing
  55. - fast matrix vector products
  56. Disadvantages of the CSR format
  57. - slow column slicing operations (consider CSC)
  58. - changes to the sparsity structure are expensive (consider LIL or DOK)
  59. Examples
  60. --------
  61. >>> import numpy as np
  62. >>> from scipy.sparse import csr_matrix
  63. >>> csr_matrix((3, 4), dtype=np.int8).toarray()
  64. array([[0, 0, 0, 0],
  65. [0, 0, 0, 0],
  66. [0, 0, 0, 0]], dtype=int8)
  67. >>> row = np.array([0, 0, 1, 2, 2, 2])
  68. >>> col = np.array([0, 2, 2, 0, 1, 2])
  69. >>> data = np.array([1, 2, 3, 4, 5, 6])
  70. >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
  71. array([[1, 0, 2],
  72. [0, 0, 3],
  73. [4, 5, 6]])
  74. >>> indptr = np.array([0, 2, 3, 6])
  75. >>> indices = np.array([0, 2, 2, 0, 1, 2])
  76. >>> data = np.array([1, 2, 3, 4, 5, 6])
  77. >>> csr_matrix((data, indices, indptr), shape=(3, 3)).toarray()
  78. array([[1, 0, 2],
  79. [0, 0, 3],
  80. [4, 5, 6]])
  81. Duplicate entries are summed together:
  82. >>> row = np.array([0, 1, 2, 0])
  83. >>> col = np.array([0, 1, 1, 0])
  84. >>> data = np.array([1, 2, 4, 8])
  85. >>> csr_matrix((data, (row, col)), shape=(3, 3)).toarray()
  86. array([[9, 0, 0],
  87. [0, 2, 0],
  88. [0, 4, 0]])
  89. As an example of how to construct a CSR matrix incrementally,
  90. the following snippet builds a term-document matrix from texts:
  91. >>> docs = [["hello", "world", "hello"], ["goodbye", "cruel", "world"]]
  92. >>> indptr = [0]
  93. >>> indices = []
  94. >>> data = []
  95. >>> vocabulary = {}
  96. >>> for d in docs:
  97. ... for term in d:
  98. ... index = vocabulary.setdefault(term, len(vocabulary))
  99. ... indices.append(index)
  100. ... data.append(1)
  101. ... indptr.append(len(indices))
  102. ...
  103. >>> csr_matrix((data, indices, indptr), dtype=int).toarray()
  104. array([[2, 1, 0, 0],
  105. [0, 1, 1, 1]])
  106. """
  107. format = 'csr'
  108. def transpose(self, axes=None, copy=False):
  109. if axes is not None:
  110. raise ValueError(("Sparse matrices do not support "
  111. "an 'axes' parameter because swapping "
  112. "dimensions is the only logical permutation."))
  113. M, N = self.shape
  114. return self._csc_container((self.data, self.indices,
  115. self.indptr), shape=(N, M), copy=copy)
  116. transpose.__doc__ = spmatrix.transpose.__doc__
  117. def tolil(self, copy=False):
  118. lil = self._lil_container(self.shape, dtype=self.dtype)
  119. self.sum_duplicates()
  120. ptr,ind,dat = self.indptr,self.indices,self.data
  121. rows, data = lil.rows, lil.data
  122. for n in range(self.shape[0]):
  123. start = ptr[n]
  124. end = ptr[n+1]
  125. rows[n] = ind[start:end].tolist()
  126. data[n] = dat[start:end].tolist()
  127. return lil
  128. tolil.__doc__ = spmatrix.tolil.__doc__
  129. def tocsr(self, copy=False):
  130. if copy:
  131. return self.copy()
  132. else:
  133. return self
  134. tocsr.__doc__ = spmatrix.tocsr.__doc__
  135. def tocsc(self, copy=False):
  136. idx_dtype = get_index_dtype((self.indptr, self.indices),
  137. maxval=max(self.nnz, self.shape[0]))
  138. indptr = np.empty(self.shape[1] + 1, dtype=idx_dtype)
  139. indices = np.empty(self.nnz, dtype=idx_dtype)
  140. data = np.empty(self.nnz, dtype=upcast(self.dtype))
  141. csr_tocsc(self.shape[0], self.shape[1],
  142. self.indptr.astype(idx_dtype),
  143. self.indices.astype(idx_dtype),
  144. self.data,
  145. indptr,
  146. indices,
  147. data)
  148. A = self._csc_container((data, indices, indptr), shape=self.shape)
  149. A.has_sorted_indices = True
  150. return A
  151. tocsc.__doc__ = spmatrix.tocsc.__doc__
  152. def tobsr(self, blocksize=None, copy=True):
  153. if blocksize is None:
  154. from ._spfuncs import estimate_blocksize
  155. return self.tobsr(blocksize=estimate_blocksize(self))
  156. elif blocksize == (1,1):
  157. arg1 = (self.data.reshape(-1,1,1),self.indices,self.indptr)
  158. return self._bsr_container(arg1, shape=self.shape, copy=copy)
  159. else:
  160. R,C = blocksize
  161. M,N = self.shape
  162. if R < 1 or C < 1 or M % R != 0 or N % C != 0:
  163. raise ValueError('invalid blocksize %s' % blocksize)
  164. blks = csr_count_blocks(M,N,R,C,self.indptr,self.indices)
  165. idx_dtype = get_index_dtype((self.indptr, self.indices),
  166. maxval=max(N//C, blks))
  167. indptr = np.empty(M//R+1, dtype=idx_dtype)
  168. indices = np.empty(blks, dtype=idx_dtype)
  169. data = np.zeros((blks,R,C), dtype=self.dtype)
  170. csr_tobsr(M, N, R, C,
  171. self.indptr.astype(idx_dtype),
  172. self.indices.astype(idx_dtype),
  173. self.data,
  174. indptr, indices, data.ravel())
  175. return self._bsr_container(
  176. (data, indices, indptr), shape=self.shape
  177. )
  178. tobsr.__doc__ = spmatrix.tobsr.__doc__
  179. # these functions are used by the parent class (_cs_matrix)
  180. # to remove redundancy between csc_matrix and csr_matrix
  181. def _swap(self, x):
  182. """swap the members of x if this is a column-oriented matrix
  183. """
  184. return x
  185. def __iter__(self):
  186. indptr = np.zeros(2, dtype=self.indptr.dtype)
  187. shape = (1, self.shape[1])
  188. i0 = 0
  189. for i1 in self.indptr[1:]:
  190. indptr[1] = i1 - i0
  191. indices = self.indices[i0:i1]
  192. data = self.data[i0:i1]
  193. yield self.__class__(
  194. (data, indices, indptr), shape=shape, copy=True
  195. )
  196. i0 = i1
  197. def getrow(self, i):
  198. """Returns a copy of row i of the matrix, as a (1 x n)
  199. CSR matrix (row vector).
  200. """
  201. M, N = self.shape
  202. i = int(i)
  203. if i < 0:
  204. i += M
  205. if i < 0 or i >= M:
  206. raise IndexError('index (%d) out of range' % i)
  207. indptr, indices, data = get_csr_submatrix(
  208. M, N, self.indptr, self.indices, self.data, i, i + 1, 0, N)
  209. return self.__class__((data, indices, indptr), shape=(1, N),
  210. dtype=self.dtype, copy=False)
  211. def getcol(self, i):
  212. """Returns a copy of column i of the matrix, as a (m x 1)
  213. CSR matrix (column vector).
  214. """
  215. M, N = self.shape
  216. i = int(i)
  217. if i < 0:
  218. i += N
  219. if i < 0 or i >= N:
  220. raise IndexError('index (%d) out of range' % i)
  221. indptr, indices, data = get_csr_submatrix(
  222. M, N, self.indptr, self.indices, self.data, 0, M, i, i + 1)
  223. return self.__class__((data, indices, indptr), shape=(M, 1),
  224. dtype=self.dtype, copy=False)
  225. def _get_intXarray(self, row, col):
  226. return self.getrow(row)._minor_index_fancy(col)
  227. def _get_intXslice(self, row, col):
  228. if col.step in (1, None):
  229. return self._get_submatrix(row, col, copy=True)
  230. # TODO: uncomment this once it's faster:
  231. # return self.getrow(row)._minor_slice(col)
  232. M, N = self.shape
  233. start, stop, stride = col.indices(N)
  234. ii, jj = self.indptr[row:row+2]
  235. row_indices = self.indices[ii:jj]
  236. row_data = self.data[ii:jj]
  237. if stride > 0:
  238. ind = (row_indices >= start) & (row_indices < stop)
  239. else:
  240. ind = (row_indices <= start) & (row_indices > stop)
  241. if abs(stride) > 1:
  242. ind &= (row_indices - start) % stride == 0
  243. row_indices = (row_indices[ind] - start) // stride
  244. row_data = row_data[ind]
  245. row_indptr = np.array([0, len(row_indices)])
  246. if stride < 0:
  247. row_data = row_data[::-1]
  248. row_indices = abs(row_indices[::-1])
  249. shape = (1, max(0, int(np.ceil(float(stop - start) / stride))))
  250. return self.__class__((row_data, row_indices, row_indptr), shape=shape,
  251. dtype=self.dtype, copy=False)
  252. def _get_sliceXint(self, row, col):
  253. if row.step in (1, None):
  254. return self._get_submatrix(row, col, copy=True)
  255. return self._major_slice(row)._get_submatrix(minor=col)
  256. def _get_sliceXarray(self, row, col):
  257. return self._major_slice(row)._minor_index_fancy(col)
  258. def _get_arrayXint(self, row, col):
  259. return self._major_index_fancy(row)._get_submatrix(minor=col)
  260. def _get_arrayXslice(self, row, col):
  261. if col.step not in (1, None):
  262. col = np.arange(*col.indices(self.shape[1]))
  263. return self._get_arrayXarray(row, col)
  264. return self._major_index_fancy(row)._get_submatrix(minor=col)
  265. def isspmatrix_csr(x):
  266. """Is x of csr_matrix type?
  267. Parameters
  268. ----------
  269. x
  270. object to check for being a csr matrix
  271. Returns
  272. -------
  273. bool
  274. True if x is a csr matrix, False otherwise
  275. Examples
  276. --------
  277. >>> from scipy.sparse import csr_matrix, isspmatrix_csr
  278. >>> isspmatrix_csr(csr_matrix([[5]]))
  279. True
  280. >>> from scipy.sparse import csc_matrix, csr_matrix, isspmatrix_csc
  281. >>> isspmatrix_csr(csc_matrix([[5]]))
  282. False
  283. """
  284. from ._arrays import csr_array
  285. return isinstance(x, csr_matrix) or isinstance(x, csr_array)