utils: Handle paths with non-UTF-8 encodings
- path_to_uri() encodes unicode input as UTF-8 and leaves bytestring input unchanged before converting it to a file:// URI.
- uri_to_path() now always returns bytestrings, since we don't know whether the file path contains any non-UTF-8 encoded chars, and converting it to unicode would make such paths no longer match the dir or file they refer to.
- split_path() now assumes it is given a bytestring.
This commit is contained in:
parent
f302949ea9
commit
b5d9dc10a7
@ -51,19 +51,40 @@ def get_or_create_file(filename):
|
||||
|
||||
|
||||
def path_to_uri(*paths):
|
||||
"""
|
||||
Convert OS specific path to file:// URI.
|
||||
|
||||
Accepts either unicode strings or bytestrings. The encoding of any
|
||||
bytestring will be maintained so that :func:`uri_to_path` can return the
|
||||
same bytestring.
|
||||
|
||||
Returns a file:// URI as a unicode string.
|
||||
"""
|
||||
path = os.path.join(*paths)
|
||||
path = path.encode('utf-8')
|
||||
if isinstance(path, unicode):
|
||||
path = path.encode('utf-8')
|
||||
if sys.platform == 'win32':
|
||||
return 'file:' + urllib.pathname2url(path)
|
||||
return 'file://' + urllib.pathname2url(path)
|
||||
|
||||
|
||||
def uri_to_path(uri):
|
||||
"""
|
||||
Convert the file:// URI to an OS specific path.
|
||||
|
||||
Returns a bytestring, since the file path can contain chars with other
|
||||
encoding than UTF-8.
|
||||
|
||||
If we had returned these paths as unicode strings, you wouldn't be able to
|
||||
look up the matching dir or file on your file system because the exact path
|
||||
would be lost by ignoring its encoding.
|
||||
"""
|
||||
if isinstance(uri, unicode):
|
||||
uri = uri.encode('utf-8')
|
||||
if sys.platform == 'win32':
|
||||
path = urllib.url2pathname(re.sub('^file:', '', uri))
|
||||
return urllib.url2pathname(re.sub(b'^file:', b'', uri))
|
||||
else:
|
||||
path = urllib.url2pathname(re.sub('^file://', '', uri))
|
||||
return path.encode('latin1').decode('utf-8') # Undo double encoding
|
||||
return urllib.url2pathname(re.sub(b'^file://', b'', uri))
|
||||
|
||||
|
||||
def split_path(path):
|
||||
@ -72,7 +93,7 @@ def split_path(path):
|
||||
path, part = os.path.split(path)
|
||||
if part:
|
||||
parts.insert(0, part)
|
||||
if not path or path == '/':
|
||||
if not path or path == b'/':
|
||||
break
|
||||
return parts
|
||||
|
||||
|
||||
@ -90,31 +90,55 @@ class PathToFileURITest(unittest.TestCase):
|
||||
result = path.path_to_uri('/tmp/æøå')
|
||||
self.assertEqual(result, 'file:///tmp/%C3%A6%C3%B8%C3%A5')
|
||||
|
||||
def test_utf8_in_path(self):
|
||||
if sys.platform == 'win32':
|
||||
result = path.path_to_uri('C:/æøå'.encode('utf-8'))
|
||||
self.assertEqual(result, 'file:///C://%C3%A6%C3%B8%C3%A5')
|
||||
else:
|
||||
result = path.path_to_uri('/tmp/æøå'.encode('utf-8'))
|
||||
self.assertEqual(result, 'file:///tmp/%C3%A6%C3%B8%C3%A5')
|
||||
|
||||
def test_latin1_in_path(self):
|
||||
if sys.platform == 'win32':
|
||||
result = path.path_to_uri('C:/æøå'.encode('latin-1'))
|
||||
self.assertEqual(result, 'file:///C://%E6%F8%E5')
|
||||
else:
|
||||
result = path.path_to_uri('/tmp/æøå'.encode('latin-1'))
|
||||
self.assertEqual(result, 'file:///tmp/%E6%F8%E5')
|
||||
|
||||
|
||||
class UriToPathTest(unittest.TestCase):
|
||||
def test_simple_uri(self):
|
||||
if sys.platform == 'win32':
|
||||
result = path.uri_to_path('file:///C://WINDOWS/clock.avi')
|
||||
self.assertEqual(result, 'C:/WINDOWS/clock.avi')
|
||||
self.assertEqual(result, 'C:/WINDOWS/clock.avi'.encode('utf-8'))
|
||||
else:
|
||||
result = path.uri_to_path('file:///etc/fstab')
|
||||
self.assertEqual(result, '/etc/fstab')
|
||||
self.assertEqual(result, '/etc/fstab'.encode('utf-8'))
|
||||
|
||||
def test_space_in_uri(self):
|
||||
if sys.platform == 'win32':
|
||||
result = path.uri_to_path('file:///C://test%20this')
|
||||
self.assertEqual(result, 'C:/test this')
|
||||
self.assertEqual(result, 'C:/test this'.encode('utf-8'))
|
||||
else:
|
||||
result = path.uri_to_path('file:///tmp/test%20this')
|
||||
self.assertEqual(result, '/tmp/test this')
|
||||
self.assertEqual(result, '/tmp/test this'.encode('utf-8'))
|
||||
|
||||
def test_unicode_in_uri(self):
|
||||
if sys.platform == 'win32':
|
||||
result = path.uri_to_path('file:///C://%C3%A6%C3%B8%C3%A5')
|
||||
self.assertEqual(result, 'C:/æøå')
|
||||
self.assertEqual(result, 'C:/æøå'.encode('utf-8'))
|
||||
else:
|
||||
result = path.uri_to_path('file:///tmp/%C3%A6%C3%B8%C3%A5')
|
||||
self.assertEqual(result, '/tmp/æøå')
|
||||
self.assertEqual(result, '/tmp/æøå'.encode('utf-8'))
|
||||
|
||||
def test_latin1_in_uri(self):
|
||||
if sys.platform == 'win32':
|
||||
result = path.uri_to_path('file:///C://%E6%F8%E5')
|
||||
self.assertEqual(result, 'C:/æøå'.encode('latin-1'))
|
||||
else:
|
||||
result = path.uri_to_path('file:///tmp/%E6%F8%E5')
|
||||
self.assertEqual(result, '/tmp/æøå'.encode('latin-1'))
|
||||
|
||||
|
||||
class SplitPathTest(unittest.TestCase):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user