123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588 |
- # Copyright 2009 Canonical Ltd. All rights reserved.
- #
- # This file is part of lazr.uri
- #
- # lazr.uri is free software: you can redistribute it and/or modify it
- # under the terms of the GNU Lesser General Public License as published by
- # the Free Software Foundation, version 3 of the License.
- #
- # lazr.uri is distributed in the hope that it will be useful, but WITHOUT
- # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
- # License for more details.
- #
- # You should have received a copy of the GNU Lesser General Public License
- # along with lazr.uri. If not, see <http://www.gnu.org/licenses/>.
- """Functions for working with generic syntax URIs."""
# Under Python 2 a module-level ``__metaclass__ = type`` makes the plain
# ``class X:`` statements in this module new-style classes.
__metaclass__ = type

# Public names exported by ``from <module> import *``.
__all__ = [
    'URI',
    'InvalidURIError',
    'find_uris_in_text',
    'possible_uri_re',
    'merge',
    'remove_dot_segments',
    ]
- import re
# Compatibility shim: when the builtin name ``unicode`` does not exist
# (NameError), alias it to ``str`` so isinstance() checks below still work.
try:
    unicode
except NameError:
    unicode = str
# Default port numbers for different URI schemes.
# The registered URI schemes come from
#   http://www.iana.org/assignments/uri-schemes.html
# The default ports come from the relevant RFCs.
#
# Ports are stored as strings because they are compared against the
# textual port component of a URI.
_default_port = {
    # Official schemes
    'acap': '674',
    'dav': '80',
    'dict': '2628',
    'dns': '53',
    'ftp': '21',
    'go': '1096',
    'gopher': '70',
    'h323': '1720',
    'http': '80',
    'https': '443',
    'imap': '143',
    'ipp': '631',
    'iris.beep': '702',
    'ldap': '389',
    'mtqp': '1038',
    'mupdate': '3905',
    'nfs': '2049',
    'nntp': '119',
    'pop': '110',
    'rtsp': '554',
    'sip': '5060',
    'sips': '5061',
    'snmp': '161',
    'soap.beep': '605',
    'soap.beeps': '605',
    'telnet': '23',
    'tftp': '69',
    'tip': '3372',
    'vemmi': '575',
    'xmlrpc.beep': '602',
    'xmlrpc.beeps': '602',
    'z39.50r': '210',
    'z39.50s': '210',

    # Historical schemes
    'prospero': '1525',
    'wais': '210',

    # Common but unregistered schemes
    'bzr+http': '80',
    'bzr+ssh': '22',
    'irc': '6667',
    'sftp': '22',
    'ssh': '22',
    'svn': '3690',
    'svn+ssh': '22',
    }
# Regular expressions adapted from the ABNF in RFC 3986.  The patterns
# are compiled with re.IGNORECASE below, which is why only lower-case
# letter ranges appear in the character classes.
scheme_re = r"(?P<scheme>[a-z][-a-z0-9+.]*)"
userinfo_re = r"(?P<userinfo>(?:[-a-z0-9._~!$&\'()*+,;=:]|%[0-9a-f]{2})*)"
# The following regular expression will match some IP address style
# host names that the RFC would not (e.g. leading zeros on the
# components), but is significantly simpler.
host_re = (r"(?P<host>[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|"
           r"(?:[-a-z0-9._~!$&\'()*+,;=]|%[0-9a-f]{2})*|"
           r"\[[0-9a-z:.]+\])")
port_re = r"(?P<port>[0-9]*)"

authority_re = r"(?P<authority>(?:%s@)?%s(?::%s)?)" % (
    userinfo_re, host_re, port_re)

# The path forms of the generic syntax.  path-noscheme differs from
# path-rootless only in that its first segment may not contain a colon.
path_abempty_re = r"(?:/(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})*)*"
path_noscheme_re = (r"(?:[-a-z0-9._~!$&\'()*+,;=@]|%[0-9a-f]{2})+"
                    r"(?:/(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})*)*")
path_rootless_re = (r"(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})+"
                    r"(?:/(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})*)*")
path_absolute_re = r"/(?:%s)?" % path_rootless_re
path_empty_re = r""

hier_part_re = r"(?P<hierpart>//%s%s|%s|%s|%s)" % (
    authority_re, path_abempty_re, path_absolute_re, path_rootless_re,
    path_empty_re)

relative_part_re = r"(?P<relativepart>//%s%s|%s|%s|%s)" % (
    authority_re, path_abempty_re, path_absolute_re, path_noscheme_re,
    path_empty_re)

# Additionally we also permit square braces in the query portion to
# accommodate real-world URIs.
query_re = r"(?P<query>(?:[-a-z0-9._~!$&\'()*+,;=:@/?\[\]]|%[0-9a-f]{2})*)"
fragment_re = r"(?P<fragment>(?:[-a-z0-9._~!$&\'()*+,;=:@/?]|%[0-9a-f]{2})*)"

# Both complete patterns are anchored with \Z rather than $: "$" also
# matches just before a trailing newline, which would let a string such
# as "http://example.com/\n" be accepted as a valid URI.
uri_re = r"%s:%s(?:\?%s)?(?:#%s)?\Z" % (
    scheme_re, hier_part_re, query_re, fragment_re)

relative_ref_re = r"%s(?:\?%s)?(?:#%s)?\Z" % (
    relative_part_re, query_re, fragment_re)

# Compiled forms; used with .match(), so they are anchored at both ends.
uri_pat = re.compile(uri_re, re.IGNORECASE)
relative_ref_pat = re.compile(relative_ref_re, re.IGNORECASE)
def merge(basepath, relpath, has_authority):
    """Combine a base path and a relative path into a single path.

    Implements the "merge" routine from Section 5.2.3 of RFC 3986.

    :param basepath: the path component of the base URI.
    :param relpath: the path component of the relative reference.
    :param has_authority: whether the base URI has an authority
        component.  The RFC handles an empty base path differently
        depending on this, hence the extra argument.
    :return: the merged path.  Dot segments are not removed here.
    """
    if basepath == '' and has_authority:
        return '/' + relpath
    # Keep the base path up to and including its right-most slash (or
    # nothing at all if it has no slash), then append the relative path.
    leading, separator, _last_segment = basepath.rpartition('/')
    return leading + separator + relpath
def remove_dot_segments(path):
    """Return 'path' with its '.' and '..' segments resolved.

    Implements the algorithm from Section 5.2.4 of RFC 3986: the input
    is consumed from the left while completed segments are collected in
    an output buffer, which is joined to form the result.
    """
    segments = []
    remaining = path
    while remaining:
        # A leading "../" or "./" is simply dropped.
        if remaining.startswith('../'):
            remaining = remaining[3:]
            continue
        if remaining.startswith('./'):
            remaining = remaining[2:]
            continue
        # "/./" (or a final "/.") collapses to "/".
        if remaining == '/.' or remaining.startswith('/./'):
            remaining = '/' + remaining[3:]
            continue
        # "/../" (or a final "/..") collapses to "/" and also discards
        # the most recently emitted segment, if there is one.
        if remaining == '/..' or remaining.startswith('/../'):
            remaining = '/' + remaining[4:]
            if segments:
                segments.pop()
            continue
        # A lone "." or ".." is all that is left: discard it.
        if remaining in ('.', '..'):
            remaining = ''
            continue
        # Otherwise move the first segment (with its leading slash, if
        # any) to the output, up to but excluding the next slash.
        search_from = 1 if remaining.startswith('/') else 0
        boundary = remaining.find('/', search_from)
        if boundary < 0:
            boundary = len(remaining)
        segments.append(remaining[:boundary])
        remaining = remaining[boundary:]
    return ''.join(segments)
def normalise_unreserved(string):
    """Return a version of 'string' where no unreserved characters are encoded.

    Unreserved characters are defined in Section 2.3 of RFC 3986.

    Percent-encoded sequences for unreserved characters are replaced by
    the character itself; all other percent-encoded sequences are kept
    but have their hex digits normalised to upper case.

    A '%' that is not followed by exactly two hexadecimal digits is not
    a valid escape; it is left in the output untouched rather than being
    dropped, rewritten or causing an error.

    :param string: the URI component to normalise.
    :return: the normalised component.
    """
    hex_digits = '0123456789abcdefABCDEF'
    unreserved = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                  'abcdefghijklmnopqrstuvwxyz'
                  '0123456789-._~')
    pieces = string.split('%')
    # pieces[0] precedes the first '%'; every later piece followed a '%'.
    for index in range(1, len(pieces)):
        piece = pieces[index]
        code, rest = piece[:2], piece[2:]
        # Check the digits explicitly: int(x, 16) would also accept
        # signs and whitespace (e.g. '-1', '+f'), which are not escapes.
        if (len(code) != 2 or code[0] not in hex_digits or
                code[1] not in hex_digits):
            # Malformed escape: restore the '%' removed by split().
            pieces[index] = '%' + piece
            continue
        char = chr(int(code, 16))
        if char in unreserved:
            pieces[index] = char + rest
        else:
            pieces[index] = '%' + code.upper() + rest
    return ''.join(pieces)
class InvalidURIError(Exception):
    """Raised when a string or set of components is not a valid URI."""
class URI:
    """A class that represents a URI.

    This class can represent arbitrary URIs that conform to the
    generic syntax described in RFC 3986.

    The normalised components are available as the attributes 'scheme',
    'userinfo', 'host', 'port', 'path', 'query' and 'fragment'.  Optional
    components that are not present are None.
    """

    def __init__(self, uri=None, scheme=None, userinfo=None, host=None,
                 port=None, path=None, query=None, fragment=None):
        """Create a URI instance.

        Can be called with either a string URI or the component parts
        of the URI as keyword arguments.

        In either case, all arguments are expected to be appropriately
        URI encoded.

        :raises InvalidURIError: if the string is not a valid URI, if
            required components are missing, or if the scheme is one
            that requires a host name and none is present.
        """
        assert (uri is not None and scheme is None and userinfo is None and
                host is None and port is None and path is None and
                query is None and fragment is None) or uri is None, (
            "URI() must be called with a single string argument or "
            "with URI components given as keyword arguments.")

        if uri is not None:
            if isinstance(uri, unicode):
                try:
                    uri.encode('ASCII')
                except UnicodeEncodeError:
                    raise InvalidURIError(
                        'URIs must consist of ASCII characters')
            match = uri_pat.match(uri)
            if match is None:
                raise InvalidURIError('"%s" is not a valid URI' % uri)
            self.scheme = match.group('scheme')
            self.userinfo = match.group('userinfo')
            self.host = match.group('host')
            self.port = match.group('port')
            hierpart = match.group('hierpart')
            authority = match.group('authority')
            if authority is None:
                self.path = hierpart
            else:
                # Skip past the //authority part
                self.path = hierpart[2+len(authority):]
            self.query = match.group('query')
            self.fragment = match.group('fragment')
        else:
            if scheme is None:
                raise InvalidURIError('URIs must have a scheme')
            if host is None and (userinfo is not None or port is not None):
                raise InvalidURIError(
                    'host must be given if userinfo or port are')
            if path is None:
                raise InvalidURIError('URIs must have a path')
            self.scheme = scheme
            self.userinfo = userinfo
            self.host = host
            self.port = port
            self.path = path
            self.query = query
            self.fragment = fragment

        self._normalise()

        # These schemes are only meaningful with a host name.
        if (self.scheme in ['http', 'https', 'ftp', 'gopher', 'telnet',
                            'imap', 'mms', 'rtsp', 'svn', 'svn+ssh',
                            'bzr', 'bzr+http', 'bzr+ssh'] and
                not self.host):
            raise InvalidURIError('%s URIs must have a host name' %
                                  self.scheme)

    def _normalise(self):
        """Perform normalisation of URI components.

        Lower-cases the scheme and host, decodes percent-encoded
        unreserved characters, drops an empty or scheme-default port,
        uses '/' for an empty path when a host is present, and removes
        dot segments from the path.
        """
        self.scheme = self.scheme.lower()

        if self.userinfo is not None:
            self.userinfo = normalise_unreserved(self.userinfo)
        if self.host is not None:
            self.host = normalise_unreserved(self.host.lower())
        if self.port == '':
            self.port = None
        elif self.port is not None:
            if self.port == _default_port.get(self.scheme):
                self.port = None
        if self.host is not None and self.path == '':
            self.path = '/'
        self.path = normalise_unreserved(remove_dot_segments(self.path))

        if self.query is not None:
            self.query = normalise_unreserved(self.query)
        if self.fragment is not None:
            self.fragment = normalise_unreserved(self.fragment)

    @property
    def authority(self):
        """The authority part of the URI, or None if there is no host."""
        if self.host is None:
            return None
        authority = self.host
        if self.userinfo is not None:
            authority = '%s@%s' % (self.userinfo, authority)
        if self.port is not None:
            authority = '%s:%s' % (authority, self.port)
        return authority

    @property
    def hier_part(self):
        """The hierarchical part of the URI.

        This is '//authority' followed by the path when there is an
        authority, and just the path otherwise.
        """
        authority = self.authority
        if authority is None:
            return self.path
        else:
            return '//%s%s' % (authority, self.path)

    def __str__(self):
        uri = '%s:%s' % (self.scheme, self.hier_part)
        if self.query is not None:
            uri += '?%s' % self.query
        if self.fragment is not None:
            uri += '#%s' % self.fragment
        return uri

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, str(self))

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return (self.scheme == other.scheme and
                    self.authority == other.authority and
                    self.path == other.path and
                    self.query == other.query and
                    self.fragment == other.fragment)
        else:
            return NotImplemented

    def __ne__(self, other):
        equal = self.__eq__(other)
        # NotImplemented is a singleton, so test it by identity.
        if equal is NotImplemented:
            return NotImplemented
        else:
            return not equal

    def __hash__(self):
        # Hash the same components that __eq__ compares.
        return hash((
            self.scheme, self.authority, self.path, self.query, self.fragment))

    def replace(self, **parts):
        """Replace one or more parts of the URI, returning the result.

        The keyword arguments name the components to change (as accepted
        by the constructor).  With no arguments, self is returned.
        """
        if not parts:
            return self
        baseparts = dict(
            scheme=self.scheme,
            userinfo=self.userinfo,
            host=self.host,
            port=self.port,
            path=self.path,
            query=self.query,
            fragment=self.fragment)
        baseparts.update(parts)
        return self.__class__(**baseparts)

    def resolve(self, reference):
        """Resolve the given URI reference relative to this URI.

        Uses the rules from Section 5.2 of RFC 3986 to resolve the new
        URI.

        :raises InvalidURIError: if 'reference' is neither a valid URI
            nor a valid relative reference.
        """
        # If the reference is a full URI, then return it as is.
        try:
            return self.__class__(reference)
        except InvalidURIError:
            pass

        match = relative_ref_pat.match(reference)
        if match is None:
            raise InvalidURIError("Invalid relative reference")

        parts = dict(scheme=self.scheme)
        authority = match.group('authority')
        if authority is not None:
            # The reference supplies its own authority, so only the
            # scheme is inherited from this URI.
            parts['userinfo'] = match.group('userinfo')
            parts['host'] = match.group('host')
            parts['port'] = match.group('port')
            # Skip over the //authority part
            parts['path'] = remove_dot_segments(
                match.group('relativepart')[2+len(authority):])
            parts['query'] = match.group('query')
        else:
            path = match.group('relativepart')
            query = match.group('query')
            if path == '':
                # Empty path: keep this URI's path, and its query too
                # unless the reference provides one.
                parts['path'] = self.path
                if query is not None:
                    parts['query'] = query
                else:
                    parts['query'] = self.query
            else:
                if path.startswith('/'):
                    parts['path'] = remove_dot_segments(path)
                else:
                    parts['path'] = merge(self.path, path,
                                          has_authority=self.host is not None)
                    parts['path'] = remove_dot_segments(parts['path'])
                parts['query'] = query
            parts['userinfo'] = self.userinfo
            parts['host'] = self.host
            parts['port'] = self.port
        parts['fragment'] = match.group('fragment')

        return self.__class__(**parts)

    def append(self, path):
        """Append the given path to this URI.

        The path must not start with a slash, but a slash is added to
        base URI (before appending the path), in case it doesn't end
        with a slash.
        """
        assert not path.startswith('/')
        return self.ensureSlash().resolve(path)

    def contains(self, other):
        """Returns True if the URI 'other' is contained by this one.

        'other' is contained if it has the same scheme and authority and
        its path is equal to, or lies below, this URI's path.
        """
        if (self.scheme != other.scheme or
                self.authority != other.authority):
            return False
        if self.path == other.path:
            return True
        # Compare with trailing slashes so that only whole path segments
        # match: '/foo' contains '/foo/bar' but not '/foobar'.
        basepath = self.path
        if not basepath.endswith('/'):
            basepath += '/'
        otherpath = other.path
        if not otherpath.endswith('/'):
            otherpath += '/'
        return otherpath.startswith(basepath)

    def underDomain(self, domain):
        """Return True if the given domain name is a parent of the URL's host.

        An empty domain is a parent of everything.  A URI without a host
        is not under any non-empty domain.
        """
        if len(domain) == 0:
            return True
        if self.host is None:
            # No host to compare (e.g. a mailto: URI).
            return False
        our_segments = self.host.split('.')
        domain_segments = domain.split('.')
        # Compare whole labels from the right-hand end.
        return our_segments[-len(domain_segments):] == domain_segments

    def ensureSlash(self):
        """Return a URI with the path normalised to end with a slash."""
        if self.path.endswith('/'):
            return self
        else:
            return self.replace(path=self.path + '/')

    def ensureNoSlash(self):
        """Return a URI with the path normalised to not end with a slash.

        All trailing slashes are stripped before the URI is rebuilt.
        """
        if self.path.endswith('/'):
            return self.replace(path=self.path.rstrip('/'))
        else:
            return self
- # Regular expression for finding URIs in a body of text:
- #
- # From RFC 3986 ABNF for URIs:
- #
- # URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
- # hier-part = "//" authority path-abempty
- # / path-absolute
- # / path-rootless
- # / path-empty
- #
- # authority = [ userinfo "@" ] host [ ":" port ]
- # userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
- # host = IP-literal / IPv4address / reg-name
- # reg-name = *( unreserved / pct-encoded / sub-delims )
- # port = *DIGIT
- #
- # path-abempty = *( "/" segment )
- # path-absolute = "/" [ segment-nz *( "/" segment ) ]
- # path-rootless = segment-nz *( "/" segment )
- # path-empty = 0<pchar>
- #
- # segment = *pchar
- # segment-nz = 1*pchar
- # pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
- #
- # query = *( pchar / "/" / "?" )
- # fragment = *( pchar / "/" / "?" )
- #
- # unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
- # pct-encoded = "%" HEXDIG HEXDIG
- # sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
- # / "*" / "+" / "," / ";" / "="
- #
- # We only match a set of known scheme names. We don't handle
- # IP-literal either.
- #
- # We will simplify "unreserved / pct-encoded / sub-delims" as the
- # following regular expression:
- # [-a-zA-Z0-9._~%!$&'()*+,;=]
- #
- # We also require that the path-rootless form not begin with a
- # colon to avoid matching strings like "http::foo" (to avoid bug
- # #40255).
- #
- # The path-empty pattern is not matched either, due to false
- # positives.
- #
- # Some allowed URI punctuation characters will be trimmed if they
- # appear at the end of the URI since they may be incidental in the
- # flow of the text.
- #
- # apport has at one time produced query strings containing square
- # braces (that are not percent-encoded). In RFC 3986 they seem to be
- # allowed by section 2.2 "Reserved Characters", yet section 3.4
- # "Query" appears to provide a strict definition of the query string
- # that would forbid square braces. Either way, links with
- # non-percent-encoded square braces are being used on Launchpad so
- # it's probably best to accommodate them.
# Verbose-mode (re.VERBOSE) pattern for candidate URIs in free text.  The
# %(unreserved)s placeholder is filled in with the simplified
# "unreserved / pct-encoded / sub-delims" character set.
possible_uri_re = r'''
\b
(?:about|gopher|http|https|sftp|news|ftp|mailto|file|irc|jabber|xmpp)
:
(?:
(?:
# "//" authority path-abempty
//
(?: # userinfo
[%(unreserved)s:]*
@
)?
(?: # host
\d+\.\d+\.\d+\.\d+ |
[%(unreserved)s]*
)
(?: # port
: \d*
)?
(?: / [%(unreserved)s:@]* )*
) | (?:
# path-absolute
/
(?: [%(unreserved)s:@]+
(?: / [%(unreserved)s:@]* )* )?
) | (?:
# path-rootless
[%(unreserved)s@]
[%(unreserved)s:@]*
(?: / [%(unreserved)s:@]* )*
)
)
(?: # query
\?
[%(unreserved)s:@/\?\[\]]*
)?
(?: # fragment
\#
[%(unreserved)s:@/\?]*
)?
''' % {'unreserved': "-a-zA-Z0-9._~%!$&'()*+,;="}

# Compiled form of the pattern above.
possible_uri_pat = re.compile(possible_uri_re, re.IGNORECASE | re.VERBOSE)
# Matches a run of punctuation at the very end of a candidate URI;
# find_uris_in_text() strips it before parsing.
uri_trailers_pat = re.compile(r'([,.?:);>]+)$')
def find_uris_in_text(text):
    """Generate a URI object for each URI found in a block of text.

    Candidates are located with 'possible_uri_pat'.  Trailing
    punctuation that is probably part of the surrounding prose rather
    than the URI is trimmed, and candidates that do not parse as a valid
    URI are silently skipped.
    """
    for candidate in possible_uri_pat.finditer(text):
        # Strip characters from the end that are unlikely to belong to
        # the URI itself.
        trimmed = uri_trailers_pat.sub('', candidate.group())
        try:
            found = URI(trimmed)
        except InvalidURIError:
            pass
        else:
            yield found
|