123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905 |
- # Last Change: Mon Aug 20 08:00 PM 2007 J
- import re
- import datetime
- import numpy as np
- import csv
- import ctypes
- """A module to read arff files."""
- __all__ = ['MetaData', 'loadarff', 'ArffError', 'ParseArffError']
- # An Arff file is basically two parts:
- # - header
- # - data
- #
- # A header has each of its components starting by @META where META is one of
- # the keyword (attribute of relation, for now).
- # TODO:
- # - both integer and reals are treated as numeric -> the integer info
- # is lost!
- # - Replace ValueError by ParseError or something
- # We know can handle the following:
- # - numeric and nominal attributes
- # - missing values for numeric attributes
- r_meta = re.compile(r'^\s*@')
- # Match a comment
- r_comment = re.compile(r'^%')
- # Match an empty line
- r_empty = re.compile(r'^\s+$')
- # Match a header line, that is a line which starts by @ + a word
- r_headerline = re.compile(r'^\s*@\S*')
- r_datameta = re.compile(r'^@[Dd][Aa][Tt][Aa]')
- r_relation = re.compile(r'^@[Rr][Ee][Ll][Aa][Tt][Ii][Oo][Nn]\s*(\S*)')
- r_attribute = re.compile(r'^\s*@[Aa][Tt][Tt][Rr][Ii][Bb][Uu][Tt][Ee]\s*(..*$)')
- r_nominal = re.compile(r'{(.+)}')
- r_date = re.compile(r"[Dd][Aa][Tt][Ee]\s+[\"']?(.+?)[\"']?$")
- # To get attributes name enclosed with ''
- r_comattrval = re.compile(r"'(..+)'\s+(..+$)")
- # To get normal attributes
- r_wcomattrval = re.compile(r"(\S+)\s+(..+$)")
- # ------------------------
- # Module defined exception
- # ------------------------
- class ArffError(OSError):
- pass
- class ParseArffError(ArffError):
- pass
- # ----------
- # Attributes
- # ----------
- class Attribute:
- type_name = None
- def __init__(self, name):
- self.name = name
- self.range = None
- self.dtype = np.object_
- @classmethod
- def parse_attribute(cls, name, attr_string):
- """
- Parse the attribute line if it knows how. Returns the parsed
- attribute, or None.
- """
- return None
- def parse_data(self, data_str):
- """
- Parse a value of this type.
- """
- return None
- def __str__(self):
- """
- Parse a value of this type.
- """
- return self.name + ',' + self.type_name
- class NominalAttribute(Attribute):
- type_name = 'nominal'
- def __init__(self, name, values):
- super().__init__(name)
- self.values = values
- self.range = values
- self.dtype = (np.string_, max(len(i) for i in values))
- @staticmethod
- def _get_nom_val(atrv):
- """Given a string containing a nominal type, returns a tuple of the
- possible values.
- A nominal type is defined as something framed between braces ({}).
- Parameters
- ----------
- atrv : str
- Nominal type definition
- Returns
- -------
- poss_vals : tuple
- possible values
- Examples
- --------
- >>> get_nom_val("{floup, bouga, fl, ratata}")
- ('floup', 'bouga', 'fl', 'ratata')
- """
- m = r_nominal.match(atrv)
- if m:
- attrs, _ = split_data_line(m.group(1))
- return tuple(attrs)
- else:
- raise ValueError("This does not look like a nominal string")
- @classmethod
- def parse_attribute(cls, name, attr_string):
- """
- Parse the attribute line if it knows how. Returns the parsed
- attribute, or None.
- For nominal attributes, the attribute string would be like '{<attr_1>,
- <attr2>, <attr_3>}'.
- """
- if attr_string[0] == '{':
- values = cls._get_nom_val(attr_string)
- return cls(name, values)
- else:
- return None
- def parse_data(self, data_str):
- """
- Parse a value of this type.
- """
- if data_str in self.values:
- return data_str
- elif data_str == '?':
- return data_str
- else:
- raise ValueError("%s value not in %s" % (str(data_str),
- str(self.values)))
- def __str__(self):
- msg = self.name + ",{"
- for i in range(len(self.values)-1):
- msg += self.values[i] + ","
- msg += self.values[-1]
- msg += "}"
- return msg
- class NumericAttribute(Attribute):
- def __init__(self, name):
- super().__init__(name)
- self.type_name = 'numeric'
- self.dtype = np.float_
- @classmethod
- def parse_attribute(cls, name, attr_string):
- """
- Parse the attribute line if it knows how. Returns the parsed
- attribute, or None.
- For numeric attributes, the attribute string would be like
- 'numeric' or 'int' or 'real'.
- """
- attr_string = attr_string.lower().strip()
- if (attr_string[:len('numeric')] == 'numeric' or
- attr_string[:len('int')] == 'int' or
- attr_string[:len('real')] == 'real'):
- return cls(name)
- else:
- return None
- def parse_data(self, data_str):
- """
- Parse a value of this type.
- Parameters
- ----------
- data_str : str
- string to convert
- Returns
- -------
- f : float
- where float can be nan
- Examples
- --------
- >>> atr = NumericAttribute('atr')
- >>> atr.parse_data('1')
- 1.0
- >>> atr.parse_data('1\\n')
- 1.0
- >>> atr.parse_data('?\\n')
- nan
- """
- if '?' in data_str:
- return np.nan
- else:
- return float(data_str)
- def _basic_stats(self, data):
- nbfac = data.size * 1. / (data.size - 1)
- return (np.nanmin(data), np.nanmax(data),
- np.mean(data), np.std(data) * nbfac)
- class StringAttribute(Attribute):
- def __init__(self, name):
- super().__init__(name)
- self.type_name = 'string'
- @classmethod
- def parse_attribute(cls, name, attr_string):
- """
- Parse the attribute line if it knows how. Returns the parsed
- attribute, or None.
- For string attributes, the attribute string would be like
- 'string'.
- """
- attr_string = attr_string.lower().strip()
- if attr_string[:len('string')] == 'string':
- return cls(name)
- else:
- return None
- class DateAttribute(Attribute):
- def __init__(self, name, date_format, datetime_unit):
- super().__init__(name)
- self.date_format = date_format
- self.datetime_unit = datetime_unit
- self.type_name = 'date'
- self.range = date_format
- self.dtype = np.datetime64(0, self.datetime_unit)
- @staticmethod
- def _get_date_format(atrv):
- m = r_date.match(atrv)
- if m:
- pattern = m.group(1).strip()
- # convert time pattern from Java's SimpleDateFormat to C's format
- datetime_unit = None
- if "yyyy" in pattern:
- pattern = pattern.replace("yyyy", "%Y")
- datetime_unit = "Y"
- elif "yy":
- pattern = pattern.replace("yy", "%y")
- datetime_unit = "Y"
- if "MM" in pattern:
- pattern = pattern.replace("MM", "%m")
- datetime_unit = "M"
- if "dd" in pattern:
- pattern = pattern.replace("dd", "%d")
- datetime_unit = "D"
- if "HH" in pattern:
- pattern = pattern.replace("HH", "%H")
- datetime_unit = "h"
- if "mm" in pattern:
- pattern = pattern.replace("mm", "%M")
- datetime_unit = "m"
- if "ss" in pattern:
- pattern = pattern.replace("ss", "%S")
- datetime_unit = "s"
- if "z" in pattern or "Z" in pattern:
- raise ValueError("Date type attributes with time zone not "
- "supported, yet")
- if datetime_unit is None:
- raise ValueError("Invalid or unsupported date format")
- return pattern, datetime_unit
- else:
- raise ValueError("Invalid or no date format")
- @classmethod
- def parse_attribute(cls, name, attr_string):
- """
- Parse the attribute line if it knows how. Returns the parsed
- attribute, or None.
- For date attributes, the attribute string would be like
- 'date <format>'.
- """
- attr_string_lower = attr_string.lower().strip()
- if attr_string_lower[:len('date')] == 'date':
- date_format, datetime_unit = cls._get_date_format(attr_string)
- return cls(name, date_format, datetime_unit)
- else:
- return None
- def parse_data(self, data_str):
- """
- Parse a value of this type.
- """
- date_str = data_str.strip().strip("'").strip('"')
- if date_str == '?':
- return np.datetime64('NaT', self.datetime_unit)
- else:
- dt = datetime.datetime.strptime(date_str, self.date_format)
- return np.datetime64(dt).astype(
- "datetime64[%s]" % self.datetime_unit)
- def __str__(self):
- return super().__str__() + ',' + self.date_format
- class RelationalAttribute(Attribute):
- def __init__(self, name):
- super().__init__(name)
- self.type_name = 'relational'
- self.dtype = np.object_
- self.attributes = []
- self.dialect = None
- @classmethod
- def parse_attribute(cls, name, attr_string):
- """
- Parse the attribute line if it knows how. Returns the parsed
- attribute, or None.
- For date attributes, the attribute string would be like
- 'date <format>'.
- """
- attr_string_lower = attr_string.lower().strip()
- if attr_string_lower[:len('relational')] == 'relational':
- return cls(name)
- else:
- return None
- def parse_data(self, data_str):
- # Copy-pasted
- elems = list(range(len(self.attributes)))
- escaped_string = data_str.encode().decode("unicode-escape")
- row_tuples = []
- for raw in escaped_string.split("\n"):
- row, self.dialect = split_data_line(raw, self.dialect)
- row_tuples.append(tuple(
- [self.attributes[i].parse_data(row[i]) for i in elems]))
- return np.array(row_tuples,
- [(a.name, a.dtype) for a in self.attributes])
- def __str__(self):
- return (super().__str__() + '\n\t' +
- '\n\t'.join(str(a) for a in self.attributes))
- # -----------------
- # Various utilities
- # -----------------
- def to_attribute(name, attr_string):
- attr_classes = (NominalAttribute, NumericAttribute, DateAttribute,
- StringAttribute, RelationalAttribute)
- for cls in attr_classes:
- attr = cls.parse_attribute(name, attr_string)
- if attr is not None:
- return attr
- raise ParseArffError("unknown attribute %s" % attr_string)
- def csv_sniffer_has_bug_last_field():
- """
- Checks if the bug https://bugs.python.org/issue30157 is unpatched.
- """
- # We only compute this once.
- has_bug = getattr(csv_sniffer_has_bug_last_field, "has_bug", None)
- if has_bug is None:
- dialect = csv.Sniffer().sniff("3, 'a'")
- csv_sniffer_has_bug_last_field.has_bug = dialect.quotechar != "'"
- has_bug = csv_sniffer_has_bug_last_field.has_bug
- return has_bug
- def workaround_csv_sniffer_bug_last_field(sniff_line, dialect, delimiters):
- """
- Workaround for the bug https://bugs.python.org/issue30157 if is unpatched.
- """
- if csv_sniffer_has_bug_last_field():
- # Reuses code from the csv module
- right_regex = r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'
- for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
- r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # .*?",
- right_regex, # ,".*?"
- r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
- regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
- matches = regexp.findall(sniff_line)
- if matches:
- break
- # If it does not match the expression that was bugged, then this bug does not apply
- if restr != right_regex:
- return
- groupindex = regexp.groupindex
- # There is only one end of the string
- assert len(matches) == 1
- m = matches[0]
- n = groupindex['quote'] - 1
- quote = m[n]
- n = groupindex['delim'] - 1
- delim = m[n]
- n = groupindex['space'] - 1
- space = bool(m[n])
- dq_regexp = re.compile(
- r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" %
- {'delim': re.escape(delim), 'quote': quote}, re.MULTILINE
- )
- doublequote = bool(dq_regexp.search(sniff_line))
- dialect.quotechar = quote
- if delim in delimiters:
- dialect.delimiter = delim
- dialect.doublequote = doublequote
- dialect.skipinitialspace = space
- def split_data_line(line, dialect=None):
- delimiters = ",\t"
- # This can not be done in a per reader basis, and relational fields
- # can be HUGE
- csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
- # Remove the line end if any
- if line[-1] == '\n':
- line = line[:-1]
-
- # Remove potential trailing whitespace
- line = line.strip()
-
- sniff_line = line
- # Add a delimiter if none is present, so that the csv.Sniffer
- # does not complain for a single-field CSV.
- if not any(d in line for d in delimiters):
- sniff_line += ","
- if dialect is None:
- dialect = csv.Sniffer().sniff(sniff_line, delimiters=delimiters)
- workaround_csv_sniffer_bug_last_field(sniff_line=sniff_line,
- dialect=dialect,
- delimiters=delimiters)
- row = next(csv.reader([line], dialect))
- return row, dialect
- # --------------
- # Parsing header
- # --------------
- def tokenize_attribute(iterable, attribute):
- """Parse a raw string in header (e.g., starts by @attribute).
- Given a raw string attribute, try to get the name and type of the
- attribute. Constraints:
- * The first line must start with @attribute (case insensitive, and
- space like characters before @attribute are allowed)
- * Works also if the attribute is spread on multilines.
- * Works if empty lines or comments are in between
- Parameters
- ----------
- attribute : str
- the attribute string.
- Returns
- -------
- name : str
- name of the attribute
- value : str
- value of the attribute
- next : str
- next line to be parsed
- Examples
- --------
- If attribute is a string defined in python as r"floupi real", will
- return floupi as name, and real as value.
- >>> iterable = iter([0] * 10) # dummy iterator
- >>> tokenize_attribute(iterable, r"@attribute floupi real")
- ('floupi', 'real', 0)
- If attribute is r"'floupi 2' real", will return 'floupi 2' as name,
- and real as value.
- >>> tokenize_attribute(iterable, r" @attribute 'floupi 2' real ")
- ('floupi 2', 'real', 0)
- """
- sattr = attribute.strip()
- mattr = r_attribute.match(sattr)
- if mattr:
- # atrv is everything after @attribute
- atrv = mattr.group(1)
- if r_comattrval.match(atrv):
- name, type = tokenize_single_comma(atrv)
- next_item = next(iterable)
- elif r_wcomattrval.match(atrv):
- name, type = tokenize_single_wcomma(atrv)
- next_item = next(iterable)
- else:
- # Not sure we should support this, as it does not seem supported by
- # weka.
- raise ValueError("multi line not supported yet")
- else:
- raise ValueError("First line unparsable: %s" % sattr)
- attribute = to_attribute(name, type)
- if type.lower() == 'relational':
- next_item = read_relational_attribute(iterable, attribute, next_item)
- # raise ValueError("relational attributes not supported yet")
- return attribute, next_item
- def tokenize_single_comma(val):
- # XXX we match twice the same string (here and at the caller level). It is
- # stupid, but it is easier for now...
- m = r_comattrval.match(val)
- if m:
- try:
- name = m.group(1).strip()
- type = m.group(2).strip()
- except IndexError as e:
- raise ValueError("Error while tokenizing attribute") from e
- else:
- raise ValueError("Error while tokenizing single %s" % val)
- return name, type
- def tokenize_single_wcomma(val):
- # XXX we match twice the same string (here and at the caller level). It is
- # stupid, but it is easier for now...
- m = r_wcomattrval.match(val)
- if m:
- try:
- name = m.group(1).strip()
- type = m.group(2).strip()
- except IndexError as e:
- raise ValueError("Error while tokenizing attribute") from e
- else:
- raise ValueError("Error while tokenizing single %s" % val)
- return name, type
- def read_relational_attribute(ofile, relational_attribute, i):
- """Read the nested attributes of a relational attribute"""
- r_end_relational = re.compile(r'^@[Ee][Nn][Dd]\s*' +
- relational_attribute.name + r'\s*$')
- while not r_end_relational.match(i):
- m = r_headerline.match(i)
- if m:
- isattr = r_attribute.match(i)
- if isattr:
- attr, i = tokenize_attribute(ofile, i)
- relational_attribute.attributes.append(attr)
- else:
- raise ValueError("Error parsing line %s" % i)
- else:
- i = next(ofile)
- i = next(ofile)
- return i
- def read_header(ofile):
- """Read the header of the iterable ofile."""
- i = next(ofile)
- # Pass first comments
- while r_comment.match(i):
- i = next(ofile)
- # Header is everything up to DATA attribute ?
- relation = None
- attributes = []
- while not r_datameta.match(i):
- m = r_headerline.match(i)
- if m:
- isattr = r_attribute.match(i)
- if isattr:
- attr, i = tokenize_attribute(ofile, i)
- attributes.append(attr)
- else:
- isrel = r_relation.match(i)
- if isrel:
- relation = isrel.group(1)
- else:
- raise ValueError("Error parsing line %s" % i)
- i = next(ofile)
- else:
- i = next(ofile)
- return relation, attributes
- class MetaData:
- """Small container to keep useful information on a ARFF dataset.
- Knows about attributes names and types.
- Examples
- --------
- ::
- data, meta = loadarff('iris.arff')
- # This will print the attributes names of the iris.arff dataset
- for i in meta:
- print(i)
- # This works too
- meta.names()
- # Getting attribute type
- types = meta.types()
- Methods
- -------
- names
- types
- Notes
- -----
- Also maintains the list of attributes in order, i.e., doing for i in
- meta, where meta is an instance of MetaData, will return the
- different attribute names in the order they were defined.
- """
- def __init__(self, rel, attr):
- self.name = rel
- self._attributes = {a.name: a for a in attr}
- def __repr__(self):
- msg = ""
- msg += "Dataset: %s\n" % self.name
- for i in self._attributes:
- msg += "\t%s's type is %s" % (i, self._attributes[i].type_name)
- if self._attributes[i].range:
- msg += ", range is %s" % str(self._attributes[i].range)
- msg += '\n'
- return msg
- def __iter__(self):
- return iter(self._attributes)
- def __getitem__(self, key):
- attr = self._attributes[key]
- return (attr.type_name, attr.range)
- def names(self):
- """Return the list of attribute names.
- Returns
- -------
- attrnames : list of str
- The attribute names.
- """
- return list(self._attributes)
- def types(self):
- """Return the list of attribute types.
- Returns
- -------
- attr_types : list of str
- The attribute types.
- """
- attr_types = [self._attributes[name].type_name
- for name in self._attributes]
- return attr_types
- def loadarff(f):
- """
- Read an arff file.
- The data is returned as a record array, which can be accessed much like
- a dictionary of NumPy arrays. For example, if one of the attributes is
- called 'pressure', then its first 10 data points can be accessed from the
- ``data`` record array like so: ``data['pressure'][0:10]``
- Parameters
- ----------
- f : file-like or str
- File-like object to read from, or filename to open.
- Returns
- -------
- data : record array
- The data of the arff file, accessible by attribute names.
- meta : `MetaData`
- Contains information about the arff file such as name and
- type of attributes, the relation (name of the dataset), etc.
- Raises
- ------
- ParseArffError
- This is raised if the given file is not ARFF-formatted.
- NotImplementedError
- The ARFF file has an attribute which is not supported yet.
- Notes
- -----
- This function should be able to read most arff files. Not
- implemented functionality include:
- * date type attributes
- * string type attributes
- It can read files with numeric and nominal attributes. It cannot read
- files with sparse data ({} in the file). However, this function can
- read files with missing data (? in the file), representing the data
- points as NaNs.
- Examples
- --------
- >>> from scipy.io import arff
- >>> from io import StringIO
- >>> content = \"\"\"
- ... @relation foo
- ... @attribute width numeric
- ... @attribute height numeric
- ... @attribute color {red,green,blue,yellow,black}
- ... @data
- ... 5.0,3.25,blue
- ... 4.5,3.75,green
- ... 3.0,4.00,red
- ... \"\"\"
- >>> f = StringIO(content)
- >>> data, meta = arff.loadarff(f)
- >>> data
- array([(5.0, 3.25, 'blue'), (4.5, 3.75, 'green'), (3.0, 4.0, 'red')],
- dtype=[('width', '<f8'), ('height', '<f8'), ('color', '|S6')])
- >>> meta
- Dataset: foo
- \twidth's type is numeric
- \theight's type is numeric
- \tcolor's type is nominal, range is ('red', 'green', 'blue', 'yellow', 'black')
- """
- if hasattr(f, 'read'):
- ofile = f
- else:
- ofile = open(f, 'rt')
- try:
- return _loadarff(ofile)
- finally:
- if ofile is not f: # only close what we opened
- ofile.close()
- def _loadarff(ofile):
- # Parse the header file
- try:
- rel, attr = read_header(ofile)
- except ValueError as e:
- msg = "Error while parsing header, error was: " + str(e)
- raise ParseArffError(msg) from e
- # Check whether we have a string attribute (not supported yet)
- hasstr = False
- for a in attr:
- if isinstance(a, StringAttribute):
- hasstr = True
- meta = MetaData(rel, attr)
- # XXX The following code is not great
- # Build the type descriptor descr and the list of convertors to convert
- # each attribute to the suitable type (which should match the one in
- # descr).
- # This can be used once we want to support integer as integer values and
- # not as numeric anymore (using masked arrays ?).
- if hasstr:
- # How to support string efficiently ? Ideally, we should know the max
- # size of the string before allocating the numpy array.
- raise NotImplementedError("String attributes not supported yet, sorry")
- ni = len(attr)
- def generator(row_iter, delim=','):
- # TODO: this is where we are spending time (~80%). I think things
- # could be made more efficiently:
- # - We could for example "compile" the function, because some values
- # do not change here.
- # - The function to convert a line to dtyped values could also be
- # generated on the fly from a string and be executed instead of
- # looping.
- # - The regex are overkill: for comments, checking that a line starts
- # by % should be enough and faster, and for empty lines, same thing
- # --> this does not seem to change anything.
- # 'compiling' the range since it does not change
- # Note, I have already tried zipping the converters and
- # row elements and got slightly worse performance.
- elems = list(range(ni))
- dialect = None
- for raw in row_iter:
- # We do not abstract skipping comments and empty lines for
- # performance reasons.
- if r_comment.match(raw) or r_empty.match(raw):
- continue
- row, dialect = split_data_line(raw, dialect)
- yield tuple([attr[i].parse_data(row[i]) for i in elems])
- a = list(generator(ofile))
- # No error should happen here: it is a bug otherwise
- data = np.array(a, [(a.name, a.dtype) for a in attr])
- return data, meta
- # ----
- # Misc
- # ----
- def basic_stats(data):
- nbfac = data.size * 1. / (data.size - 1)
- return np.nanmin(data), np.nanmax(data), np.mean(data), np.std(data) * nbfac
- def print_attribute(name, tp, data):
- type = tp.type_name
- if type == 'numeric' or type == 'real' or type == 'integer':
- min, max, mean, std = basic_stats(data)
- print("%s,%s,%f,%f,%f,%f" % (name, type, min, max, mean, std))
- else:
- print(str(tp))
- def test_weka(filename):
- data, meta = loadarff(filename)
- print(len(data.dtype))
- print(data.size)
- for i in meta:
- print_attribute(i, meta[i], data[i])
- # make sure nose does not find this as a test
- test_weka.__test__ = False
- if __name__ == '__main__':
- import sys
- filename = sys.argv[1]
- test_weka(filename)
|