_arffread.py 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905
  1. # Last Change: Mon Aug 20 08:00 PM 2007 J
  2. import re
  3. import datetime
  4. import numpy as np
  5. import csv
  6. import ctypes
"""A module to read arff files."""
__all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']

# An Arff file is basically two parts:
#  - header
#  - data
#
# A header has each of its components starting by @META where META is one of
# the keyword (attribute of relation, for now).

# TODO:
#  - both integer and reals are treated as numeric -> the integer info
#    is lost!
#  - Replace ValueError by ParseError or something

# We can now handle the following:
#  - numeric and nominal attributes
#  - missing values for numeric attributes

# Match any line starting with '@' (after optional whitespace)
r_meta = re.compile(r'^\s*@')
# Match a comment
r_comment = re.compile(r'^%')
# Match an empty line
r_empty = re.compile(r'^\s+$')
# Match a header line, that is a line which starts by @ + a word
r_headerline = re.compile(r'^\s*@\S*')
# Case-insensitive '@data' marker
r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
# '@relation <name>' -- group(1) captures the relation name
r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
# '@attribute <rest>' -- group(1) captures everything after the keyword
r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')
# Nominal type definition: values framed between braces
r_nominal = re.compile(r'{(.+)}')
# 'date <format>' -- group(1) captures the (optionally quoted) format
r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")
# To get attributes name enclosed with ''
r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
# To get normal attributes
r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
# ------------------------
# Module defined exception
# ------------------------
class ArffError(OSError):
    """Base error for problems reading an ARFF file."""
    pass
class ParseArffError(ArffError):
    """Error raised when the ARFF header or data cannot be parsed."""
    pass
# ----------
# Attributes
# ----------
class Attribute:
    """Abstract base class for a single ARFF attribute (one typed column)."""

    # Human-readable type name; overridden by subclasses.
    type_name = None

    def __init__(self, name):
        self.name = name
        # Range of admissible values (nominal values, date format, ...);
        # None when the type has no meaningful range.
        self.range = None
        # NumPy dtype used to store parsed values of this attribute.
        self.dtype = np.object_

    @classmethod
    def parse_attribute(cls, name, attr_string):
        """
        Parse the attribute line if it knows how. Returns the parsed
        attribute, or None.
        """
        return None

    def parse_data(self, data_str):
        """
        Parse a value of this type.
        """
        return None

    def __str__(self):
        """
        Return a 'name,type_name' string representation of the attribute.
        """
        return self.name + ',' + self.type_name
  71. class NominalAttribute(Attribute):
  72. type_name = 'nominal'
  73. def __init__(self, name, values):
  74. super().__init__(name)
  75. self.values = values
  76. self.range = values
  77. self.dtype = (np.string_, max(len(i) for i in values))
  78. @staticmethod
  79. def _get_nom_val(atrv):
  80. """Given a string containing a nominal type, returns a tuple of the
  81. possible values.
  82. A nominal type is defined as something framed between braces ({}).
  83. Parameters
  84. ----------
  85. atrv : str
  86. Nominal type definition
  87. Returns
  88. -------
  89. poss_vals : tuple
  90. possible values
  91. Examples
  92. --------
  93. >>> get_nom_val("{floup, bouga, fl, ratata}")
  94. ('floup', 'bouga', 'fl', 'ratata')
  95. """
  96. m = r_nominal.match(atrv)
  97. if m:
  98. attrs, _ = split_data_line(m.group(1))
  99. return tuple(attrs)
  100. else:
  101. raise ValueError("This does not look like a nominal string")
  102. @classmethod
  103. def parse_attribute(cls, name, attr_string):
  104. """
  105. Parse the attribute line if it knows how. Returns the parsed
  106. attribute, or None.
  107. For nominal attributes, the attribute string would be like '{<attr_1>,
  108. <attr2>, <attr_3>}'.
  109. """
  110. if attr_string[0] == '{':
  111. values = cls._get_nom_val(attr_string)
  112. return cls(name, values)
  113. else:
  114. return None
  115. def parse_data(self, data_str):
  116. """
  117. Parse a value of this type.
  118. """
  119. if data_str in self.values:
  120. return data_str
  121. elif data_str == '?':
  122. return data_str
  123. else:
  124. raise ValueError("%s value not in %s" % (str(data_str),
  125. str(self.values)))
  126. def __str__(self):
  127. msg = self.name + ",{"
  128. for i in range(len(self.values)-1):
  129. msg += self.values[i] + ","
  130. msg += self.values[-1]
  131. msg += "}"
  132. return msg
  133. class NumericAttribute(Attribute):
  134. def __init__(self, name):
  135. super().__init__(name)
  136. self.type_name = 'numeric'
  137. self.dtype = np.float_
  138. @classmethod
  139. def parse_attribute(cls, name, attr_string):
  140. """
  141. Parse the attribute line if it knows how. Returns the parsed
  142. attribute, or None.
  143. For numeric attributes, the attribute string would be like
  144. 'numeric' or 'int' or 'real'.
  145. """
  146. attr_string = attr_string.lower().strip()
  147. if (attr_string[:len('numeric')] == 'numeric' or
  148. attr_string[:len('int')] == 'int' or
  149. attr_string[:len('real')] == 'real'):
  150. return cls(name)
  151. else:
  152. return None
  153. def parse_data(self, data_str):
  154. """
  155. Parse a value of this type.
  156. Parameters
  157. ----------
  158. data_str : str
  159. string to convert
  160. Returns
  161. -------
  162. f : float
  163. where float can be nan
  164. Examples
  165. --------
  166. >>> atr = NumericAttribute('atr')
  167. >>> atr.parse_data('1')
  168. 1.0
  169. >>> atr.parse_data('1\\n')
  170. 1.0
  171. >>> atr.parse_data('?\\n')
  172. nan
  173. """
  174. if '?' in data_str:
  175. return np.nan
  176. else:
  177. return float(data_str)
  178. def _basic_stats(self, data):
  179. nbfac = data.size * 1. / (data.size - 1)
  180. return (np.nanmin(data), np.nanmax(data),
  181. np.mean(data), np.std(data) * nbfac)
  182. class StringAttribute(Attribute):
  183. def __init__(self, name):
  184. super().__init__(name)
  185. self.type_name = 'string'
  186. @classmethod
  187. def parse_attribute(cls, name, attr_string):
  188. """
  189. Parse the attribute line if it knows how. Returns the parsed
  190. attribute, or None.
  191. For string attributes, the attribute string would be like
  192. 'string'.
  193. """
  194. attr_string = attr_string.lower().strip()
  195. if attr_string[:len('string')] == 'string':
  196. return cls(name)
  197. else:
  198. return None
  199. class DateAttribute(Attribute):
  200. def __init__(self, name, date_format, datetime_unit):
  201. super().__init__(name)
  202. self.date_format = date_format
  203. self.datetime_unit = datetime_unit
  204. self.type_name = 'date'
  205. self.range = date_format
  206. self.dtype = np.datetime64(0, self.datetime_unit)
  207. @staticmethod
  208. def _get_date_format(atrv):
  209. m = r_date.match(atrv)
  210. if m:
  211. pattern = m.group(1).strip()
  212. # convert time pattern from Java's SimpleDateFormat to C's format
  213. datetime_unit = None
  214. if "yyyy" in pattern:
  215. pattern = pattern.replace("yyyy", "%Y")
  216. datetime_unit = "Y"
  217. elif "yy":
  218. pattern = pattern.replace("yy", "%y")
  219. datetime_unit = "Y"
  220. if "MM" in pattern:
  221. pattern = pattern.replace("MM", "%m")
  222. datetime_unit = "M"
  223. if "dd" in pattern:
  224. pattern = pattern.replace("dd", "%d")
  225. datetime_unit = "D"
  226. if "HH" in pattern:
  227. pattern = pattern.replace("HH", "%H")
  228. datetime_unit = "h"
  229. if "mm" in pattern:
  230. pattern = pattern.replace("mm", "%M")
  231. datetime_unit = "m"
  232. if "ss" in pattern:
  233. pattern = pattern.replace("ss", "%S")
  234. datetime_unit = "s"
  235. if "z" in pattern or "Z" in pattern:
  236. raise ValueError("Date type attributes with time zone not "
  237. "supported, yet")
  238. if datetime_unit is None:
  239. raise ValueError("Invalid or unsupported date format")
  240. return pattern, datetime_unit
  241. else:
  242. raise ValueError("Invalid or no date format")
  243. @classmethod
  244. def parse_attribute(cls, name, attr_string):
  245. """
  246. Parse the attribute line if it knows how. Returns the parsed
  247. attribute, or None.
  248. For date attributes, the attribute string would be like
  249. 'date <format>'.
  250. """
  251. attr_string_lower = attr_string.lower().strip()
  252. if attr_string_lower[:len('date')] == 'date':
  253. date_format, datetime_unit = cls._get_date_format(attr_string)
  254. return cls(name, date_format, datetime_unit)
  255. else:
  256. return None
  257. def parse_data(self, data_str):
  258. """
  259. Parse a value of this type.
  260. """
  261. date_str = data_str.strip().strip("'").strip('"')
  262. if date_str == '?':
  263. return np.datetime64('NaT', self.datetime_unit)
  264. else:
  265. dt = datetime.datetime.strptime(date_str, self.date_format)
  266. return np.datetime64(dt).astype(
  267. "datetime64[%s]" % self.datetime_unit)
  268. def __str__(self):
  269. return super().__str__() + ',' + self.date_format
  270. class RelationalAttribute(Attribute):
  271. def __init__(self, name):
  272. super().__init__(name)
  273. self.type_name = 'relational'
  274. self.dtype = np.object_
  275. self.attributes = []
  276. self.dialect = None
  277. @classmethod
  278. def parse_attribute(cls, name, attr_string):
  279. """
  280. Parse the attribute line if it knows how. Returns the parsed
  281. attribute, or None.
  282. For date attributes, the attribute string would be like
  283. 'date <format>'.
  284. """
  285. attr_string_lower = attr_string.lower().strip()
  286. if attr_string_lower[:len('relational')] == 'relational':
  287. return cls(name)
  288. else:
  289. return None
  290. def parse_data(self, data_str):
  291. # Copy-pasted
  292. elems = list(range(len(self.attributes)))
  293. escaped_string = data_str.encode().decode("unicode-escape")
  294. row_tuples = []
  295. for raw in escaped_string.split("\n"):
  296. row, self.dialect = split_data_line(raw, self.dialect)
  297. row_tuples.append(tuple(
  298. [self.attributes[i].parse_data(row[i]) for i in elems]))
  299. return np.array(row_tuples,
  300. [(a.name, a.dtype) for a in self.attributes])
  301. def __str__(self):
  302. return (super().__str__() + '\n\t' +
  303. '\n\t'.join(str(a) for a in self.attributes))
  304. # -----------------
  305. # Various utilities
  306. # -----------------
  307. def to_attribute(name, attr_string):
  308. attr_classes = (NominalAttribute, NumericAttribute, DateAttribute,
  309. StringAttribute, RelationalAttribute)
  310. for cls in attr_classes:
  311. attr = cls.parse_attribute(name, attr_string)
  312. if attr is not None:
  313. return attr
  314. raise ParseArffError("unknown attribute %s" % attr_string)
  315. def csv_sniffer_has_bug_last_field():
  316. """
  317. Checks if the bug https://bugs.python.org/issue30157 is unpatched.
  318. """
  319. # We only compute this once.
  320. has_bug = getattr(csv_sniffer_has_bug_last_field, "has_bug", None)
  321. if has_bug is None:
  322. dialect = csv.Sniffer().sniff("3, 'a'")
  323. csv_sniffer_has_bug_last_field.has_bug = dialect.quotechar != "'"
  324. has_bug = csv_sniffer_has_bug_last_field.has_bug
  325. return has_bug
def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
    """
    Workaround for the bug https://bugs.python.org/issue30157 if is unpatched.

    Mutates `dialect` in place (quotechar, delimiter, doublequote,
    skipinitialspace) when `sniff_line` matched the buggy expression.
    """
    if csv_sniffer_has_bug_last_field():
        # Reuses code from the csv module
        right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)',  # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',  # .*?",
                      right_regex,  # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):  # ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(sniff_line)
            if matches:
                break

        # If it does not match the expression that was bugged, then this bug
        # does not apply
        if restr != right_regex:
            return

        groupindex = regexp.groupindex

        # There is only one end of the string
        assert len(matches) == 1
        m = matches[0]

        # findall returns plain tuples; index them by named-group position.
        n = groupindex['quote'] - 1
        quote = m[n]

        n = groupindex['delim'] - 1
        delim = m[n]

        n = groupindex['space'] - 1
        space = bool(m[n])

        # Same doublequote detection the csv module performs.
        dq_regexp = re.compile(
            r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" %
            {'delim': re.escape(delim), 'quote': quote}, re.MULTILINE
        )

        doublequote = bool(dq_regexp.search(sniff_line))

        dialect.quotechar = quote
        if delim in delimiters:
            dialect.delimiter = delim
        dialect.doublequote = doublequote
        dialect.skipinitialspace = space
  364. def split_data_line(line, dialect=None):
  365. delimiters = ",\t"
  366. # This can not be done in a per reader basis, and relational fields
  367. # can be HUGE
  368. csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
  369. # Remove the line end if any
  370. if line[-1] == '\n':
  371. line = line[:-1]
  372. # Remove potential trailing whitespace
  373. line = line.strip()
  374. sniff_line = line
  375. # Add a delimiter if none is present, so that the csv.Sniffer
  376. # does not complain for a single-field CSV.
  377. if not any(d in line for d in delimiters):
  378. sniff_line += ","
  379. if dialect is None:
  380. dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
  381. workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
  382. dialect=dialect,
  383. delimiters=delimiters)
  384. row = next(csv.reader([line], dialect))
  385. return row, dialect
# --------------
# Parsing header
# --------------
def tokenize_attribute(iterable, attribute):
    """Parse a raw string in header (e.g., starts by @attribute).

    Given a raw string attribute, try to get the name and type of the
    attribute. Constraints:

    * The first line must start with @attribute (case insensitive, and
      space like characters before @attribute are allowed)
    * Works also if the attribute is spread on multilines.
    * Works if empty lines or comments are in between

    Parameters
    ----------
    iterable : iterator of str
       iterator over the remaining header lines; advanced as lines are
       consumed.
    attribute : str
       the attribute line to parse.

    Returns
    -------
    attribute : Attribute
       the parsed attribute object (built by to_attribute).
    next_item : str
       next line to be parsed.

    For example, for the line r"@attribute floupi real", the attribute is
    named 'floupi' with type string 'real'; quoted names such as
    r"@attribute 'floupi 2' real" are also handled (via r_comattrval).
    """
    sattr = attribute.strip()
    mattr = r_attribute.match(sattr)
    if mattr:
        # atrv is everything after @attribute
        atrv = mattr.group(1)
        if r_comattrval.match(atrv):
            # Quoted attribute name: 'name' type
            name, type = tokenize_single_comma(atrv)
            next_item = next(iterable)
        elif r_wcomattrval.match(atrv):
            # Unquoted attribute name: name type
            name, type = tokenize_single_wcomma(atrv)
            next_item = next(iterable)
        else:
            # Not sure we should support this, as it does not seem supported by
            # weka.
            raise ValueError("multi line not supported yet")
    else:
        raise ValueError("First line unparsable: %s" % sattr)

    attribute = to_attribute(name, type)

    if type.lower() == 'relational':
        # Relational attributes nest further @attribute lines, terminated by
        # '@end <name>'; consume them from the iterator here.
        next_item = read_relational_attribute(iterable, attribute, next_item)
    #    raise ValueError("relational attributes not supported yet")

    return attribute, next_item
  443. def tokenize_single_comma(val):
  444. # XXX we match twice the same string (here and at the caller level). It is
  445. # stupid, but it is easier for now...
  446. m = r_comattrval.match(val)
  447. if m:
  448. try:
  449. name = m.group(1).strip()
  450. type = m.group(2).strip()
  451. except IndexError as e:
  452. raise ValueError("Error while tokenizing attribute") from e
  453. else:
  454. raise ValueError("Error while tokenizing single %s" % val)
  455. return name, type
  456. def tokenize_single_wcomma(val):
  457. # XXX we match twice the same string (here and at the caller level). It is
  458. # stupid, but it is easier for now...
  459. m = r_wcomattrval.match(val)
  460. if m:
  461. try:
  462. name = m.group(1).strip()
  463. type = m.group(2).strip()
  464. except IndexError as e:
  465. raise ValueError("Error while tokenizing attribute") from e
  466. else:
  467. raise ValueError("Error while tokenizing single %s" % val)
  468. return name, type
def read_relational_attribute(ofile, relational_attribute, i):
    """Read the nested attributes of a relational attribute"""
    # The nested block is terminated by a line '@end <attribute name>'.
    r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
                                  relational_attribute.name + r'\s*$')

    while not r_end_relational.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute consumes lines from ofile and returns
                # the next line to examine.
                attr, i = tokenize_attribute(ofile, i)
                relational_attribute.attributes.append(attr)
            else:
                raise ValueError("Error parsing line %s" % i)
        else:
            # Skip comments / empty lines inside the relational block.
            i = next(ofile)

    # Skip the '@end' line itself; return the next line to process.
    i = next(ofile)
    return i
def read_header(ofile):
    """Read the header of the iterable ofile.

    Returns the relation name (or None) and the list of parsed
    Attribute objects, stopping at the '@data' line.
    """
    i = next(ofile)

    # Pass first comments
    while r_comment.match(i):
        i = next(ofile)

    # Header is everything up to DATA attribute ?
    relation = None
    attributes = []
    while not r_datameta.match(i):
        m = r_headerline.match(i)
        if m:
            isattr = r_attribute.match(i)
            if isattr:
                # tokenize_attribute consumes lines itself and returns the
                # next line to examine.
                attr, i = tokenize_attribute(ofile, i)
                attributes.append(attr)
            else:
                isrel = r_relation.match(i)
                if isrel:
                    relation = isrel.group(1)
                else:
                    raise ValueError("Error parsing line %s" % i)
                i = next(ofile)
        else:
            # Comment or blank line: skip it.
            i = next(ofile)

    return relation, attributes
  512. class MetaData:
  513. """Small container to keep useful information on a ARFF dataset.
  514. Knows about attributes names and types.
  515. Examples
  516. --------
  517. ::
  518. data, meta = loadarff('iris.arff')
  519. # This will print the attributes names of the iris.arff dataset
  520. for i in meta:
  521. print(i)
  522. # This works too
  523. meta.names()
  524. # Getting attribute type
  525. types = meta.types()
  526. Methods
  527. -------
  528. names
  529. types
  530. Notes
  531. -----
  532. Also maintains the list of attributes in order, i.e., doing for i in
  533. meta, where meta is an instance of MetaData, will return the
  534. different attribute names in the order they were defined.
  535. """
  536. def __init__(self, rel, attr):
  537. self.name = rel
  538. self._attributes = {a.name: a for a in attr}
  539. def __repr__(self):
  540. msg = ""
  541. msg += "Dataset: %s\n" % self.name
  542. for i in self._attributes:
  543. msg += "\t%s's type is %s" % (i, self._attributes[i].type_name)
  544. if self._attributes[i].range:
  545. msg += ", range is %s" % str(self._attributes[i].range)
  546. msg += '\n'
  547. return msg
  548. def __iter__(self):
  549. return iter(self._attributes)
  550. def __getitem__(self, key):
  551. attr = self._attributes[key]
  552. return (attr.type_name, attr.range)
  553. def names(self):
  554. """Return the list of attribute names.
  555. Returns
  556. -------
  557. attrnames : list of str
  558. The attribute names.
  559. """
  560. return list(self._attributes)
  561. def types(self):
  562. """Return the list of attribute types.
  563. Returns
  564. -------
  565. attr_types : list of str
  566. The attribute types.
  567. """
  568. attr_types = [self._attributes[name].type_name
  569. for name in self._attributes]
  570. return attr_types
def loadarff(f):
    """
    Read an arff file.

    The data is returned as a record array, which can be accessed much like
    a dictionary of NumPy arrays. For example, if one of the attributes is
    called 'pressure', then its first 10 data points can be accessed from the
    ``data`` record array like so: ``data['pressure'][0:10]``


    Parameters
    ----------
    f : file-like or str
       File-like object to read from, or filename to open.

    Returns
    -------
    data : record array
       The data of the arff file, accessible by attribute names.
    meta : `MetaData`
       Contains information about the arff file such as name and
       type of attributes, the relation (name of the dataset), etc.

    Raises
    ------
    ParseArffError
        This is raised if the given file is not ARFF-formatted.
    NotImplementedError
        The ARFF file has an attribute which is not supported yet.

    Notes
    -----
    This function should be able to read most arff files. Not
    implemented functionality include:

    * string type attributes

    It can read files with numeric, nominal and date attributes. It cannot
    read files with sparse data ({} in the file). However, this function can
    read files with missing data (? in the file), representing the data
    points as NaNs.

    Examples
    --------
    >>> from scipy.io import arff
    >>> from io import StringIO
    >>> content = \"\"\"
    ... @relation foo
    ... @attribute width  numeric
    ... @attribute height numeric
    ... @attribute color  {red,green,blue,yellow,black}
    ... @data
    ... 5.0,3.25,blue
    ... 4.5,3.75,green
    ... 3.0,4.00,red
    ... \"\"\"
    >>> f = StringIO(content)
    >>> data, meta = arff.loadarff(f)
    >>> data
    array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
          dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
    >>> meta
    Dataset: foo
    \twidth's type is numeric
    \theight's type is numeric
    \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')
    """
    # Accept either an open file-like object or a path to open ourselves.
    if hasattr(f, 'read'):
        ofile = f
    else:
        ofile = open(f, 'rt')
    try:
        return _loadarff(ofile)
    finally:
        if ofile is not f:  # only close what we opened
            ofile.close()
def _loadarff(ofile):
    """Parse an open ARFF stream into (record array, MetaData)."""
    # Parse the header file
    try:
        rel, attr = read_header(ofile)
    except ValueError as e:
        msg = "Error while parsing header, error was: " + str(e)
        raise ParseArffError(msg) from e

    # Check whether we have a string attribute (not supported yet)
    hasstr = False
    for a in attr:
        if isinstance(a, StringAttribute):
            hasstr = True

    meta = MetaData(rel, attr)

    # XXX The following code is not great
    # Build the type descriptor descr and the list of convertors to convert
    # each attribute to the suitable type (which should match the one in
    # descr).

    # This can be used once we want to support integer as integer values and
    # not as numeric anymore (using masked arrays ?).
    if hasstr:
        # How to support string efficiently ? Ideally, we should know the max
        # size of the string before allocating the numpy array.
        raise NotImplementedError("String attributes not supported yet, sorry")

    ni = len(attr)

    def generator(row_iter, delim=','):
        # TODO: this is where we are spending time (~80%). I think things
        # could be made more efficiently:
        #   - We could for example "compile" the function, because some values
        #   do not change here.
        #   - The function to convert a line to dtyped values could also be
        #   generated on the fly from a string and be executed instead of
        #   looping.
        #   - The regex are overkill: for comments, checking that a line starts
        #   by % should be enough and faster, and for empty lines, same thing
        #   --> this does not seem to change anything.

        # 'compiling' the range since it does not change
        # Note, I have already tried zipping the converters and
        # row elements and got slightly worse performance.
        elems = list(range(ni))

        # The sniffed dialect from the first data row is reused for the rest.
        dialect = None
        for raw in row_iter:
            # We do not abstract skipping comments and empty lines for
            # performance reasons.
            if r_comment.match(raw) or r_empty.match(raw):
                continue
            row, dialect = split_data_line(raw, dialect)
            yield tuple([attr[i].parse_data(row[i]) for i in elems])

    a = list(generator(ofile))
    # No error should happen here: it is a bug otherwise
    data = np.array(a, [(a.name, a.dtype) for a in attr])
    return data, meta
  690. # ----
  691. # Misc
  692. # ----
  693. def basic_stats(data):
  694. nbfac = data.size * 1. / (data.size - 1)
  695. return np.nanmin(data), np.nanmax(data), np.mean(data), np.std(data) * nbfac
  696. def print_attribute(name, tp, data):
  697. type = tp.type_name
  698. if type == 'numeric' or type == 'real' or type == 'integer':
  699. min, max, mean, std = basic_stats(data)
  700. print("%s,%s,%f,%f,%f,%f" % (name, type, min, max, mean, std))
  701. else:
  702. print(str(tp))
  703. def test_weka(filename):
  704. data, meta = loadarff(filename)
  705. print(len(data.dtype))
  706. print(data.size)
  707. for i in meta:
  708. print_attribute(i, meta[i], data[i])
# make sure nose does not find this as a test
test_weka.__test__ = False


if __name__ == '__main__':
    import sys
    # Usage: python _arffread.py <file.arff>
    filename = sys.argv[1]
    test_weka(filename)