# test_arffread.py (13 KB) -- tests for the ARFF reader in scipy.io.arff.
  1. import datetime
  2. import os
  3. import sys
  4. from os.path import join as pjoin
  5. from io import StringIO
  6. import numpy as np
  7. from numpy.testing import (assert_array_almost_equal,
  8. assert_array_equal, assert_equal, assert_)
  9. import pytest
  10. from pytest import raises as assert_raises
  11. from scipy.io.arff import loadarff
  12. from scipy.io.arff._arffread import read_header, ParseArffError
  13. data_path = pjoin(os.path.dirname(__file__), 'data')
  14. test1 = pjoin(data_path, 'test1.arff')
  15. test2 = pjoin(data_path, 'test2.arff')
  16. test3 = pjoin(data_path, 'test3.arff')
  17. test4 = pjoin(data_path, 'test4.arff')
  18. test5 = pjoin(data_path, 'test5.arff')
  19. test6 = pjoin(data_path, 'test6.arff')
  20. test7 = pjoin(data_path, 'test7.arff')
  21. test8 = pjoin(data_path, 'test8.arff')
  22. test9 = pjoin(data_path, 'test9.arff')
  23. test10 = pjoin(data_path, 'test10.arff')
  24. test11 = pjoin(data_path, 'test11.arff')
  25. test_quoted_nominal = pjoin(data_path, 'quoted_nominal.arff')
  26. test_quoted_nominal_spaces = pjoin(data_path, 'quoted_nominal_spaces.arff')
  27. expect4_data = [(0.1, 0.2, 0.3, 0.4, 'class1'),
  28. (-0.1, -0.2, -0.3, -0.4, 'class2'),
  29. (1, 2, 3, 4, 'class3')]
  30. expected_types = ['numeric', 'numeric', 'numeric', 'numeric', 'nominal']
  31. missing = pjoin(data_path, 'missing.arff')
  32. expect_missing_raw = np.array([[1, 5], [2, 4], [np.nan, np.nan]])
  33. expect_missing = np.empty(3, [('yop', float), ('yap', float)])
  34. expect_missing['yop'] = expect_missing_raw[:, 0]
  35. expect_missing['yap'] = expect_missing_raw[:, 1]
  36. class TestData:
  37. def test1(self):
  38. # Parsing trivial file with nothing.
  39. self._test(test4)
  40. def test2(self):
  41. # Parsing trivial file with some comments in the data section.
  42. self._test(test5)
  43. def test3(self):
  44. # Parsing trivial file with nominal attribute of 1 character.
  45. self._test(test6)
  46. def test4(self):
  47. # Parsing trivial file with trailing spaces in attribute declaration.
  48. self._test(test11)
  49. def _test(self, test_file):
  50. data, meta = loadarff(test_file)
  51. for i in range(len(data)):
  52. for j in range(4):
  53. assert_array_almost_equal(expect4_data[i][j], data[i][j])
  54. assert_equal(meta.types(), expected_types)
  55. def test_filelike(self):
  56. # Test reading from file-like object (StringIO)
  57. with open(test1) as f1:
  58. data1, meta1 = loadarff(f1)
  59. with open(test1) as f2:
  60. data2, meta2 = loadarff(StringIO(f2.read()))
  61. assert_(data1 == data2)
  62. assert_(repr(meta1) == repr(meta2))
  63. def test_path(self):
  64. # Test reading from `pathlib.Path` object
  65. from pathlib import Path
  66. with open(test1) as f1:
  67. data1, meta1 = loadarff(f1)
  68. data2, meta2 = loadarff(Path(test1))
  69. assert_(data1 == data2)
  70. assert_(repr(meta1) == repr(meta2))
  71. class TestMissingData:
  72. def test_missing(self):
  73. data, meta = loadarff(missing)
  74. for i in ['yop', 'yap']:
  75. assert_array_almost_equal(data[i], expect_missing[i])
  76. class TestNoData:
  77. def test_nodata(self):
  78. # The file nodata.arff has no data in the @DATA section.
  79. # Reading it should result in an array with length 0.
  80. nodata_filename = os.path.join(data_path, 'nodata.arff')
  81. data, meta = loadarff(nodata_filename)
  82. if sys.byteorder == 'big':
  83. end = '>'
  84. else:
  85. end = '<'
  86. expected_dtype = np.dtype([('sepallength', f'{end}f8'),
  87. ('sepalwidth', f'{end}f8'),
  88. ('petallength', f'{end}f8'),
  89. ('petalwidth', f'{end}f8'),
  90. ('class', 'S15')])
  91. assert_equal(data.dtype, expected_dtype)
  92. assert_equal(data.size, 0)
  93. class TestHeader:
  94. def test_type_parsing(self):
  95. # Test parsing type of attribute from their value.
  96. with open(test2) as ofile:
  97. rel, attrs = read_header(ofile)
  98. expected = ['numeric', 'numeric', 'numeric', 'numeric', 'numeric',
  99. 'numeric', 'string', 'string', 'nominal', 'nominal']
  100. for i in range(len(attrs)):
  101. assert_(attrs[i].type_name == expected[i])
  102. def test_badtype_parsing(self):
  103. # Test parsing wrong type of attribute from their value.
  104. def badtype_read():
  105. with open(test3) as ofile:
  106. _, _ = read_header(ofile)
  107. assert_raises(ParseArffError, badtype_read)
  108. def test_fullheader1(self):
  109. # Parsing trivial header with nothing.
  110. with open(test1) as ofile:
  111. rel, attrs = read_header(ofile)
  112. # Test relation
  113. assert_(rel == 'test1')
  114. # Test numerical attributes
  115. assert_(len(attrs) == 5)
  116. for i in range(4):
  117. assert_(attrs[i].name == 'attr%d' % i)
  118. assert_(attrs[i].type_name == 'numeric')
  119. # Test nominal attribute
  120. assert_(attrs[4].name == 'class')
  121. assert_(attrs[4].values == ('class0', 'class1', 'class2', 'class3'))
  122. def test_dateheader(self):
  123. with open(test7) as ofile:
  124. rel, attrs = read_header(ofile)
  125. assert_(rel == 'test7')
  126. assert_(len(attrs) == 5)
  127. assert_(attrs[0].name == 'attr_year')
  128. assert_(attrs[0].date_format == '%Y')
  129. assert_(attrs[1].name == 'attr_month')
  130. assert_(attrs[1].date_format == '%Y-%m')
  131. assert_(attrs[2].name == 'attr_date')
  132. assert_(attrs[2].date_format == '%Y-%m-%d')
  133. assert_(attrs[3].name == 'attr_datetime_local')
  134. assert_(attrs[3].date_format == '%Y-%m-%d %H:%M')
  135. assert_(attrs[4].name == 'attr_datetime_missing')
  136. assert_(attrs[4].date_format == '%Y-%m-%d %H:%M')
  137. def test_dateheader_unsupported(self):
  138. def read_dateheader_unsupported():
  139. with open(test8) as ofile:
  140. _, _ = read_header(ofile)
  141. assert_raises(ValueError, read_dateheader_unsupported)
  142. class TestDateAttribute:
  143. def setup_method(self):
  144. self.data, self.meta = loadarff(test7)
  145. def test_year_attribute(self):
  146. expected = np.array([
  147. '1999',
  148. '2004',
  149. '1817',
  150. '2100',
  151. '2013',
  152. '1631'
  153. ], dtype='datetime64[Y]')
  154. assert_array_equal(self.data["attr_year"], expected)
  155. def test_month_attribute(self):
  156. expected = np.array([
  157. '1999-01',
  158. '2004-12',
  159. '1817-04',
  160. '2100-09',
  161. '2013-11',
  162. '1631-10'
  163. ], dtype='datetime64[M]')
  164. assert_array_equal(self.data["attr_month"], expected)
  165. def test_date_attribute(self):
  166. expected = np.array([
  167. '1999-01-31',
  168. '2004-12-01',
  169. '1817-04-28',
  170. '2100-09-10',
  171. '2013-11-30',
  172. '1631-10-15'
  173. ], dtype='datetime64[D]')
  174. assert_array_equal(self.data["attr_date"], expected)
  175. def test_datetime_local_attribute(self):
  176. expected = np.array([
  177. datetime.datetime(year=1999, month=1, day=31, hour=0, minute=1),
  178. datetime.datetime(year=2004, month=12, day=1, hour=23, minute=59),
  179. datetime.datetime(year=1817, month=4, day=28, hour=13, minute=0),
  180. datetime.datetime(year=2100, month=9, day=10, hour=12, minute=0),
  181. datetime.datetime(year=2013, month=11, day=30, hour=4, minute=55),
  182. datetime.datetime(year=1631, month=10, day=15, hour=20, minute=4)
  183. ], dtype='datetime64[m]')
  184. assert_array_equal(self.data["attr_datetime_local"], expected)
  185. def test_datetime_missing(self):
  186. expected = np.array([
  187. 'nat',
  188. '2004-12-01T23:59',
  189. 'nat',
  190. 'nat',
  191. '2013-11-30T04:55',
  192. '1631-10-15T20:04'
  193. ], dtype='datetime64[m]')
  194. assert_array_equal(self.data["attr_datetime_missing"], expected)
  195. def test_datetime_timezone(self):
  196. assert_raises(ParseArffError, loadarff, test8)
  197. class TestRelationalAttribute:
  198. def setup_method(self):
  199. self.data, self.meta = loadarff(test9)
  200. def test_attributes(self):
  201. assert_equal(len(self.meta._attributes), 1)
  202. relational = list(self.meta._attributes.values())[0]
  203. assert_equal(relational.name, 'attr_date_number')
  204. assert_equal(relational.type_name, 'relational')
  205. assert_equal(len(relational.attributes), 2)
  206. assert_equal(relational.attributes[0].name,
  207. 'attr_date')
  208. assert_equal(relational.attributes[0].type_name,
  209. 'date')
  210. assert_equal(relational.attributes[1].name,
  211. 'attr_number')
  212. assert_equal(relational.attributes[1].type_name,
  213. 'numeric')
  214. def test_data(self):
  215. dtype_instance = [('attr_date', 'datetime64[D]'),
  216. ('attr_number', np.float_)]
  217. expected = [
  218. np.array([('1999-01-31', 1), ('1935-11-27', 10)],
  219. dtype=dtype_instance),
  220. np.array([('2004-12-01', 2), ('1942-08-13', 20)],
  221. dtype=dtype_instance),
  222. np.array([('1817-04-28', 3)],
  223. dtype=dtype_instance),
  224. np.array([('2100-09-10', 4), ('1957-04-17', 40),
  225. ('1721-01-14', 400)],
  226. dtype=dtype_instance),
  227. np.array([('2013-11-30', 5)],
  228. dtype=dtype_instance),
  229. np.array([('1631-10-15', 6)],
  230. dtype=dtype_instance)
  231. ]
  232. for i in range(len(self.data["attr_date_number"])):
  233. assert_array_equal(self.data["attr_date_number"][i],
  234. expected[i])
  235. class TestRelationalAttributeLong:
  236. def setup_method(self):
  237. self.data, self.meta = loadarff(test10)
  238. def test_attributes(self):
  239. assert_equal(len(self.meta._attributes), 1)
  240. relational = list(self.meta._attributes.values())[0]
  241. assert_equal(relational.name, 'attr_relational')
  242. assert_equal(relational.type_name, 'relational')
  243. assert_equal(len(relational.attributes), 1)
  244. assert_equal(relational.attributes[0].name,
  245. 'attr_number')
  246. assert_equal(relational.attributes[0].type_name, 'numeric')
  247. def test_data(self):
  248. dtype_instance = [('attr_number', np.float_)]
  249. expected = np.array([(n,) for n in range(30000)],
  250. dtype=dtype_instance)
  251. assert_array_equal(self.data["attr_relational"][0],
  252. expected)
  253. class TestQuotedNominal:
  254. """
  255. Regression test for issue #10232 : Exception in loadarff with quoted nominal attributes.
  256. """
  257. def setup_method(self):
  258. self.data, self.meta = loadarff(test_quoted_nominal)
  259. def test_attributes(self):
  260. assert_equal(len(self.meta._attributes), 2)
  261. age, smoker = self.meta._attributes.values()
  262. assert_equal(age.name, 'age')
  263. assert_equal(age.type_name, 'numeric')
  264. assert_equal(smoker.name, 'smoker')
  265. assert_equal(smoker.type_name, 'nominal')
  266. assert_equal(smoker.values, ['yes', 'no'])
  267. def test_data(self):
  268. age_dtype_instance = np.float_
  269. smoker_dtype_instance = '<S3'
  270. age_expected = np.array([
  271. 18,
  272. 24,
  273. 44,
  274. 56,
  275. 89,
  276. 11,
  277. ], dtype=age_dtype_instance)
  278. smoker_expected = np.array([
  279. 'no',
  280. 'yes',
  281. 'no',
  282. 'no',
  283. 'yes',
  284. 'no',
  285. ], dtype=smoker_dtype_instance)
  286. assert_array_equal(self.data["age"], age_expected)
  287. assert_array_equal(self.data["smoker"], smoker_expected)
  288. class TestQuotedNominalSpaces:
  289. """
  290. Regression test for issue #10232 : Exception in loadarff with quoted nominal attributes.
  291. """
  292. def setup_method(self):
  293. self.data, self.meta = loadarff(test_quoted_nominal_spaces)
  294. def test_attributes(self):
  295. assert_equal(len(self.meta._attributes), 2)
  296. age, smoker = self.meta._attributes.values()
  297. assert_equal(age.name, 'age')
  298. assert_equal(age.type_name, 'numeric')
  299. assert_equal(smoker.name, 'smoker')
  300. assert_equal(smoker.type_name, 'nominal')
  301. assert_equal(smoker.values, [' yes', 'no '])
  302. def test_data(self):
  303. age_dtype_instance = np.float_
  304. smoker_dtype_instance = '<S5'
  305. age_expected = np.array([
  306. 18,
  307. 24,
  308. 44,
  309. 56,
  310. 89,
  311. 11,
  312. ], dtype=age_dtype_instance)
  313. smoker_expected = np.array([
  314. 'no ',
  315. ' yes',
  316. 'no ',
  317. 'no ',
  318. ' yes',
  319. 'no ',
  320. ], dtype=smoker_dtype_instance)
  321. assert_array_equal(self.data["age"], age_expected)
  322. assert_array_equal(self.data["smoker"], smoker_expected)