utils: Handle paths with non-UTF-8 encodings

- path_to_uri() encodes unicode input as UTF-8 and leaves bytestring input
  unchanged before it is converted to file:// URIs.

- uri_to_path() will now always return bytestrings, since we don't know if
  there is any non-UTF-8 encoded chars in the file path, and converting it to
  unicode would make such paths no longer match the dir or file it was
  referring to.

- split_path() will now assume it gets a bytestring in.
This commit is contained in:
Stein Magnus Jodal 2012-12-03 15:03:46 +01:00
parent f302949ea9
commit b5d9dc10a7
2 changed files with 56 additions and 11 deletions

View File

@ -51,19 +51,40 @@ def get_or_create_file(filename):
def path_to_uri(*paths):
"""
Convert OS specific path to file:// URI.
Accepts either unicode strings or bytestrings. The encoding of any
bytestring will be maintained so that :func:`uri_to_path` can return the
same bytestring.
Returns a file:// URI as an unicode string.
"""
path = os.path.join(*paths)
path = path.encode('utf-8')
if isinstance(path, unicode):
path = path.encode('utf-8')
if sys.platform == 'win32':
return 'file:' + urllib.pathname2url(path)
return 'file://' + urllib.pathname2url(path)
def uri_to_path(uri):
"""
Convert the file:// to a OS specific path.
Returns a bytestring, since the file path can contain chars with other
encoding than UTF-8.
If we had returned these paths as unicode strings, you wouldn't be able to
look up the matching dir or file on your file system because the exact path
would be lost by ignoring its encoding.
"""
if isinstance(uri, unicode):
uri = uri.encode('utf-8')
if sys.platform == 'win32':
path = urllib.url2pathname(re.sub('^file:', '', uri))
return urllib.url2pathname(re.sub(b'^file:', b'', uri))
else:
path = urllib.url2pathname(re.sub('^file://', '', uri))
return path.encode('latin1').decode('utf-8') # Undo double encoding
return urllib.url2pathname(re.sub(b'^file://', b'', uri))
def split_path(path):
@ -72,7 +93,7 @@ def split_path(path):
path, part = os.path.split(path)
if part:
parts.insert(0, part)
if not path or path == '/':
if not path or path == b'/':
break
return parts

View File

@ -90,31 +90,55 @@ class PathToFileURITest(unittest.TestCase):
result = path.path_to_uri('/tmp/æøå')
self.assertEqual(result, 'file:///tmp/%C3%A6%C3%B8%C3%A5')
def test_utf8_in_path(self):
if sys.platform == 'win32':
result = path.path_to_uri('C:/æøå'.encode('utf-8'))
self.assertEqual(result, 'file:///C://%C3%A6%C3%B8%C3%A5')
else:
result = path.path_to_uri('/tmp/æøå'.encode('utf-8'))
self.assertEqual(result, 'file:///tmp/%C3%A6%C3%B8%C3%A5')
def test_latin1_in_path(self):
if sys.platform == 'win32':
result = path.path_to_uri('C:/æøå'.encode('latin-1'))
self.assertEqual(result, 'file:///C://%E6%F8%E5')
else:
result = path.path_to_uri('/tmp/æøå'.encode('latin-1'))
self.assertEqual(result, 'file:///tmp/%E6%F8%E5')
class UriToPathTest(unittest.TestCase):
def test_simple_uri(self):
if sys.platform == 'win32':
result = path.uri_to_path('file:///C://WINDOWS/clock.avi')
self.assertEqual(result, 'C:/WINDOWS/clock.avi')
self.assertEqual(result, 'C:/WINDOWS/clock.avi'.encode('utf-8'))
else:
result = path.uri_to_path('file:///etc/fstab')
self.assertEqual(result, '/etc/fstab')
self.assertEqual(result, '/etc/fstab'.encode('utf-8'))
def test_space_in_uri(self):
if sys.platform == 'win32':
result = path.uri_to_path('file:///C://test%20this')
self.assertEqual(result, 'C:/test this')
self.assertEqual(result, 'C:/test this'.encode('utf-8'))
else:
result = path.uri_to_path('file:///tmp/test%20this')
self.assertEqual(result, '/tmp/test this')
self.assertEqual(result, '/tmp/test this'.encode('utf-8'))
def test_unicode_in_uri(self):
if sys.platform == 'win32':
result = path.uri_to_path('file:///C://%C3%A6%C3%B8%C3%A5')
self.assertEqual(result, 'C:/æøå')
self.assertEqual(result, 'C:/æøå'.encode('utf-8'))
else:
result = path.uri_to_path('file:///tmp/%C3%A6%C3%B8%C3%A5')
self.assertEqual(result, '/tmp/æøå')
self.assertEqual(result, '/tmp/æøå'.encode('utf-8'))
def test_latin1_in_uri(self):
if sys.platform == 'win32':
result = path.uri_to_path('file:///C://%E6%F8%E5')
self.assertEqual(result, 'C:/æøå'.encode('latin-1'))
else:
result = path.uri_to_path('file:///tmp/%E6%F8%E5')
self.assertEqual(result, '/tmp/æøå'.encode('latin-1'))
class SplitPathTest(unittest.TestCase):