test_unicode.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. import pytest
  2. import numpy as np
  3. from numpy.testing import assert_, assert_equal, assert_array_equal
  4. def buffer_length(arr):
  5. if isinstance(arr, str):
  6. if not arr:
  7. charmax = 0
  8. else:
  9. charmax = max([ord(c) for c in arr])
  10. if charmax < 256:
  11. size = 1
  12. elif charmax < 65536:
  13. size = 2
  14. else:
  15. size = 4
  16. return size * len(arr)
  17. v = memoryview(arr)
  18. if v.shape is None:
  19. return len(v) * v.itemsize
  20. else:
  21. return np.prod(v.shape) * v.itemsize
  22. # In both cases below we need to make sure that the byte swapped value (as
  23. # UCS4) is still a valid unicode:
  24. # Value that can be represented in UCS2 interpreters
  25. ucs2_value = '\u0900'
  26. # Value that cannot be represented in UCS2 interpreters (but can in UCS4)
  27. ucs4_value = '\U00100900'
  28. def test_string_cast():
  29. str_arr = np.array(["1234", "1234\0\0"], dtype='S')
  30. uni_arr1 = str_arr.astype('>U')
  31. uni_arr2 = str_arr.astype('<U')
  32. with pytest.warns(FutureWarning):
  33. assert str_arr != uni_arr1
  34. with pytest.warns(FutureWarning):
  35. assert str_arr != uni_arr2
  36. assert_array_equal(uni_arr1, uni_arr2)
  37. ############################################################
  38. # Creation tests
  39. ############################################################
  40. class CreateZeros:
  41. """Check the creation of zero-valued arrays"""
  42. def content_check(self, ua, ua_scalar, nbytes):
  43. # Check the length of the unicode base type
  44. assert_(int(ua.dtype.str[2:]) == self.ulen)
  45. # Check the length of the data buffer
  46. assert_(buffer_length(ua) == nbytes)
  47. # Small check that data in array element is ok
  48. assert_(ua_scalar == '')
  49. # Encode to ascii and double check
  50. assert_(ua_scalar.encode('ascii') == b'')
  51. # Check buffer lengths for scalars
  52. assert_(buffer_length(ua_scalar) == 0)
  53. def test_zeros0D(self):
  54. # Check creation of 0-dimensional objects
  55. ua = np.zeros((), dtype='U%s' % self.ulen)
  56. self.content_check(ua, ua[()], 4*self.ulen)
  57. def test_zerosSD(self):
  58. # Check creation of single-dimensional objects
  59. ua = np.zeros((2,), dtype='U%s' % self.ulen)
  60. self.content_check(ua, ua[0], 4*self.ulen*2)
  61. self.content_check(ua, ua[1], 4*self.ulen*2)
  62. def test_zerosMD(self):
  63. # Check creation of multi-dimensional objects
  64. ua = np.zeros((2, 3, 4), dtype='U%s' % self.ulen)
  65. self.content_check(ua, ua[0, 0, 0], 4*self.ulen*2*3*4)
  66. self.content_check(ua, ua[-1, -1, -1], 4*self.ulen*2*3*4)
  67. class TestCreateZeros_1(CreateZeros):
  68. """Check the creation of zero-valued arrays (size 1)"""
  69. ulen = 1
  70. class TestCreateZeros_2(CreateZeros):
  71. """Check the creation of zero-valued arrays (size 2)"""
  72. ulen = 2
  73. class TestCreateZeros_1009(CreateZeros):
  74. """Check the creation of zero-valued arrays (size 1009)"""
  75. ulen = 1009
  76. class CreateValues:
  77. """Check the creation of unicode arrays with values"""
  78. def content_check(self, ua, ua_scalar, nbytes):
  79. # Check the length of the unicode base type
  80. assert_(int(ua.dtype.str[2:]) == self.ulen)
  81. # Check the length of the data buffer
  82. assert_(buffer_length(ua) == nbytes)
  83. # Small check that data in array element is ok
  84. assert_(ua_scalar == self.ucs_value*self.ulen)
  85. # Encode to UTF-8 and double check
  86. assert_(ua_scalar.encode('utf-8') ==
  87. (self.ucs_value*self.ulen).encode('utf-8'))
  88. # Check buffer lengths for scalars
  89. if self.ucs_value == ucs4_value:
  90. # In UCS2, the \U0010FFFF will be represented using a
  91. # surrogate *pair*
  92. assert_(buffer_length(ua_scalar) == 2*2*self.ulen)
  93. else:
  94. # In UCS2, the \uFFFF will be represented using a
  95. # regular 2-byte word
  96. assert_(buffer_length(ua_scalar) == 2*self.ulen)
  97. def test_values0D(self):
  98. # Check creation of 0-dimensional objects with values
  99. ua = np.array(self.ucs_value*self.ulen, dtype='U%s' % self.ulen)
  100. self.content_check(ua, ua[()], 4*self.ulen)
  101. def test_valuesSD(self):
  102. # Check creation of single-dimensional objects with values
  103. ua = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
  104. self.content_check(ua, ua[0], 4*self.ulen*2)
  105. self.content_check(ua, ua[1], 4*self.ulen*2)
  106. def test_valuesMD(self):
  107. # Check creation of multi-dimensional objects with values
  108. ua = np.array([[[self.ucs_value*self.ulen]*2]*3]*4, dtype='U%s' % self.ulen)
  109. self.content_check(ua, ua[0, 0, 0], 4*self.ulen*2*3*4)
  110. self.content_check(ua, ua[-1, -1, -1], 4*self.ulen*2*3*4)
  111. class TestCreateValues_1_UCS2(CreateValues):
  112. """Check the creation of valued arrays (size 1, UCS2 values)"""
  113. ulen = 1
  114. ucs_value = ucs2_value
  115. class TestCreateValues_1_UCS4(CreateValues):
  116. """Check the creation of valued arrays (size 1, UCS4 values)"""
  117. ulen = 1
  118. ucs_value = ucs4_value
  119. class TestCreateValues_2_UCS2(CreateValues):
  120. """Check the creation of valued arrays (size 2, UCS2 values)"""
  121. ulen = 2
  122. ucs_value = ucs2_value
  123. class TestCreateValues_2_UCS4(CreateValues):
  124. """Check the creation of valued arrays (size 2, UCS4 values)"""
  125. ulen = 2
  126. ucs_value = ucs4_value
  127. class TestCreateValues_1009_UCS2(CreateValues):
  128. """Check the creation of valued arrays (size 1009, UCS2 values)"""
  129. ulen = 1009
  130. ucs_value = ucs2_value
  131. class TestCreateValues_1009_UCS4(CreateValues):
  132. """Check the creation of valued arrays (size 1009, UCS4 values)"""
  133. ulen = 1009
  134. ucs_value = ucs4_value
  135. ############################################################
  136. # Assignment tests
  137. ############################################################
  138. class AssignValues:
  139. """Check the assignment of unicode arrays with values"""
  140. def content_check(self, ua, ua_scalar, nbytes):
  141. # Check the length of the unicode base type
  142. assert_(int(ua.dtype.str[2:]) == self.ulen)
  143. # Check the length of the data buffer
  144. assert_(buffer_length(ua) == nbytes)
  145. # Small check that data in array element is ok
  146. assert_(ua_scalar == self.ucs_value*self.ulen)
  147. # Encode to UTF-8 and double check
  148. assert_(ua_scalar.encode('utf-8') ==
  149. (self.ucs_value*self.ulen).encode('utf-8'))
  150. # Check buffer lengths for scalars
  151. if self.ucs_value == ucs4_value:
  152. # In UCS2, the \U0010FFFF will be represented using a
  153. # surrogate *pair*
  154. assert_(buffer_length(ua_scalar) == 2*2*self.ulen)
  155. else:
  156. # In UCS2, the \uFFFF will be represented using a
  157. # regular 2-byte word
  158. assert_(buffer_length(ua_scalar) == 2*self.ulen)
  159. def test_values0D(self):
  160. # Check assignment of 0-dimensional objects with values
  161. ua = np.zeros((), dtype='U%s' % self.ulen)
  162. ua[()] = self.ucs_value*self.ulen
  163. self.content_check(ua, ua[()], 4*self.ulen)
  164. def test_valuesSD(self):
  165. # Check assignment of single-dimensional objects with values
  166. ua = np.zeros((2,), dtype='U%s' % self.ulen)
  167. ua[0] = self.ucs_value*self.ulen
  168. self.content_check(ua, ua[0], 4*self.ulen*2)
  169. ua[1] = self.ucs_value*self.ulen
  170. self.content_check(ua, ua[1], 4*self.ulen*2)
  171. def test_valuesMD(self):
  172. # Check assignment of multi-dimensional objects with values
  173. ua = np.zeros((2, 3, 4), dtype='U%s' % self.ulen)
  174. ua[0, 0, 0] = self.ucs_value*self.ulen
  175. self.content_check(ua, ua[0, 0, 0], 4*self.ulen*2*3*4)
  176. ua[-1, -1, -1] = self.ucs_value*self.ulen
  177. self.content_check(ua, ua[-1, -1, -1], 4*self.ulen*2*3*4)
  178. class TestAssignValues_1_UCS2(AssignValues):
  179. """Check the assignment of valued arrays (size 1, UCS2 values)"""
  180. ulen = 1
  181. ucs_value = ucs2_value
  182. class TestAssignValues_1_UCS4(AssignValues):
  183. """Check the assignment of valued arrays (size 1, UCS4 values)"""
  184. ulen = 1
  185. ucs_value = ucs4_value
  186. class TestAssignValues_2_UCS2(AssignValues):
  187. """Check the assignment of valued arrays (size 2, UCS2 values)"""
  188. ulen = 2
  189. ucs_value = ucs2_value
  190. class TestAssignValues_2_UCS4(AssignValues):
  191. """Check the assignment of valued arrays (size 2, UCS4 values)"""
  192. ulen = 2
  193. ucs_value = ucs4_value
  194. class TestAssignValues_1009_UCS2(AssignValues):
  195. """Check the assignment of valued arrays (size 1009, UCS2 values)"""
  196. ulen = 1009
  197. ucs_value = ucs2_value
  198. class TestAssignValues_1009_UCS4(AssignValues):
  199. """Check the assignment of valued arrays (size 1009, UCS4 values)"""
  200. ulen = 1009
  201. ucs_value = ucs4_value
  202. ############################################################
  203. # Byteorder tests
  204. ############################################################
  205. class ByteorderValues:
  206. """Check the byteorder of unicode arrays in round-trip conversions"""
  207. def test_values0D(self):
  208. # Check byteorder of 0-dimensional objects
  209. ua = np.array(self.ucs_value*self.ulen, dtype='U%s' % self.ulen)
  210. ua2 = ua.newbyteorder()
  211. # This changes the interpretation of the data region (but not the
  212. # actual data), therefore the returned scalars are not
  213. # the same (they are byte-swapped versions of each other).
  214. assert_(ua[()] != ua2[()])
  215. ua3 = ua2.newbyteorder()
  216. # Arrays must be equal after the round-trip
  217. assert_equal(ua, ua3)
  218. def test_valuesSD(self):
  219. # Check byteorder of single-dimensional objects
  220. ua = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
  221. ua2 = ua.newbyteorder()
  222. assert_((ua != ua2).all())
  223. assert_(ua[-1] != ua2[-1])
  224. ua3 = ua2.newbyteorder()
  225. # Arrays must be equal after the round-trip
  226. assert_equal(ua, ua3)
  227. def test_valuesMD(self):
  228. # Check byteorder of multi-dimensional objects
  229. ua = np.array([[[self.ucs_value*self.ulen]*2]*3]*4,
  230. dtype='U%s' % self.ulen)
  231. ua2 = ua.newbyteorder()
  232. assert_((ua != ua2).all())
  233. assert_(ua[-1, -1, -1] != ua2[-1, -1, -1])
  234. ua3 = ua2.newbyteorder()
  235. # Arrays must be equal after the round-trip
  236. assert_equal(ua, ua3)
  237. def test_values_cast(self):
  238. # Check byteorder of when casting the array for a strided and
  239. # contiguous array:
  240. test1 = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
  241. test2 = np.repeat(test1, 2)[::2]
  242. for ua in (test1, test2):
  243. ua2 = ua.astype(dtype=ua.dtype.newbyteorder())
  244. assert_((ua == ua2).all())
  245. assert_(ua[-1] == ua2[-1])
  246. ua3 = ua2.astype(dtype=ua.dtype)
  247. # Arrays must be equal after the round-trip
  248. assert_equal(ua, ua3)
  249. def test_values_updowncast(self):
  250. # Check byteorder of when casting the array to a longer and shorter
  251. # string length for strided and contiguous arrays
  252. test1 = np.array([self.ucs_value*self.ulen]*2, dtype='U%s' % self.ulen)
  253. test2 = np.repeat(test1, 2)[::2]
  254. for ua in (test1, test2):
  255. # Cast to a longer type with zero padding
  256. longer_type = np.dtype('U%s' % (self.ulen+1)).newbyteorder()
  257. ua2 = ua.astype(dtype=longer_type)
  258. assert_((ua == ua2).all())
  259. assert_(ua[-1] == ua2[-1])
  260. # Cast back again with truncating:
  261. ua3 = ua2.astype(dtype=ua.dtype)
  262. # Arrays must be equal after the round-trip
  263. assert_equal(ua, ua3)
  264. class TestByteorder_1_UCS2(ByteorderValues):
  265. """Check the byteorder in unicode (size 1, UCS2 values)"""
  266. ulen = 1
  267. ucs_value = ucs2_value
  268. class TestByteorder_1_UCS4(ByteorderValues):
  269. """Check the byteorder in unicode (size 1, UCS4 values)"""
  270. ulen = 1
  271. ucs_value = ucs4_value
  272. class TestByteorder_2_UCS2(ByteorderValues):
  273. """Check the byteorder in unicode (size 2, UCS2 values)"""
  274. ulen = 2
  275. ucs_value = ucs2_value
  276. class TestByteorder_2_UCS4(ByteorderValues):
  277. """Check the byteorder in unicode (size 2, UCS4 values)"""
  278. ulen = 2
  279. ucs_value = ucs4_value
  280. class TestByteorder_1009_UCS2(ByteorderValues):
  281. """Check the byteorder in unicode (size 1009, UCS2 values)"""
  282. ulen = 1009
  283. ucs_value = ucs2_value
  284. class TestByteorder_1009_UCS4(ByteorderValues):
  285. """Check the byteorder in unicode (size 1009, UCS4 values)"""
  286. ulen = 1009
  287. ucs_value = ucs4_value