From b5d9dc10a70a660184757760fb55223ef2d164ae Mon Sep 17 00:00:00 2001
From: Stein Magnus Jodal
Date: Mon, 3 Dec 2012 15:03:46 +0100
Subject: [PATCH] utils: Handle paths with non-UTF-8 encodings

- path_to_uri() encodes unicode input as UTF-8 and leaves bytestring input
  unchanged before it is converted to file:// URIs.
- uri_to_path() will now always return bytestrings, since we don't know if
  there is any non-UTF-8 encoded chars in the file path, and converting it to
  unicode would make such paths no longer match the dir or file it was
  referring to.
- split_path() will now assume it gets a bytestring in.
---
 mopidy/utils/path.py     | 31 ++++++++++++++++++++++++++-----
 tests/utils/path_test.py | 36 ++++++++++++++++++++++++++++++------
 2 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/mopidy/utils/path.py b/mopidy/utils/path.py
index 73063183..eea13fb1 100644
--- a/mopidy/utils/path.py
+++ b/mopidy/utils/path.py
@@ -51,19 +51,40 @@ def get_or_create_file(filename):
 
 
 def path_to_uri(*paths):
+    """
+    Convert OS specific path to file:// URI.
+
+    Accepts either unicode strings or bytestrings. The encoding of any
+    bytestring will be maintained so that :func:`uri_to_path` can return the
+    same bytestring.
+
+    Returns a file:// URI as an unicode string.
+    """
     path = os.path.join(*paths)
-    path = path.encode('utf-8')
+    if isinstance(path, unicode):
+        path = path.encode('utf-8')
     if sys.platform == 'win32':
         return 'file:' + urllib.pathname2url(path)
     return 'file://' + urllib.pathname2url(path)
 
 
 def uri_to_path(uri):
+    """
+    Convert the file:// to a OS specific path.
+
+    Returns a bytestring, since the file path can contain chars with other
+    encoding than UTF-8.
+
+    If we had returned these paths as unicode strings, you wouldn't be able to
+    look up the matching dir or file on your file system because the exact path
+    would be lost by ignoring its encoding.
+    """
+    if isinstance(uri, unicode):
+        uri = uri.encode('utf-8')
     if sys.platform == 'win32':
-        path = urllib.url2pathname(re.sub('^file:', '', uri))
+        return urllib.url2pathname(re.sub(b'^file:', b'', uri))
     else:
-        path = urllib.url2pathname(re.sub('^file://', '', uri))
-    return path.encode('latin1').decode('utf-8')  # Undo double encoding
+        return urllib.url2pathname(re.sub(b'^file://', b'', uri))
 
 
 def split_path(path):
@@ -72,7 +93,7 @@ def split_path(path):
         path, part = os.path.split(path)
         if part:
             parts.insert(0, part)
-        if not path or path == '/':
+        if not path or path == b'/':
             break
     return parts
 
diff --git a/tests/utils/path_test.py b/tests/utils/path_test.py
index 512a3ba1..cfe58e0a 100644
--- a/tests/utils/path_test.py
+++ b/tests/utils/path_test.py
@@ -90,31 +90,55 @@ class PathToFileURITest(unittest.TestCase):
             result = path.path_to_uri('/tmp/æøå')
             self.assertEqual(result, 'file:///tmp/%C3%A6%C3%B8%C3%A5')
 
+    def test_utf8_in_path(self):
+        if sys.platform == 'win32':
+            result = path.path_to_uri('C:/æøå'.encode('utf-8'))
+            self.assertEqual(result, 'file:///C://%C3%A6%C3%B8%C3%A5')
+        else:
+            result = path.path_to_uri('/tmp/æøå'.encode('utf-8'))
+            self.assertEqual(result, 'file:///tmp/%C3%A6%C3%B8%C3%A5')
+
+    def test_latin1_in_path(self):
+        if sys.platform == 'win32':
+            result = path.path_to_uri('C:/æøå'.encode('latin-1'))
+            self.assertEqual(result, 'file:///C://%E6%F8%E5')
+        else:
+            result = path.path_to_uri('/tmp/æøå'.encode('latin-1'))
+            self.assertEqual(result, 'file:///tmp/%E6%F8%E5')
+
 
 class UriToPathTest(unittest.TestCase):
     def test_simple_uri(self):
         if sys.platform == 'win32':
             result = path.uri_to_path('file:///C://WINDOWS/clock.avi')
-            self.assertEqual(result, 'C:/WINDOWS/clock.avi')
+            self.assertEqual(result, 'C:/WINDOWS/clock.avi'.encode('utf-8'))
         else:
             result = path.uri_to_path('file:///etc/fstab')
-            self.assertEqual(result, '/etc/fstab')
+            self.assertEqual(result, '/etc/fstab'.encode('utf-8'))
 
     def test_space_in_uri(self):
         if sys.platform == 'win32':
             result = path.uri_to_path('file:///C://test%20this')
-            self.assertEqual(result, 'C:/test this')
+            self.assertEqual(result, 'C:/test this'.encode('utf-8'))
         else:
             result = path.uri_to_path('file:///tmp/test%20this')
-            self.assertEqual(result, '/tmp/test this')
+            self.assertEqual(result, '/tmp/test this'.encode('utf-8'))
 
     def test_unicode_in_uri(self):
         if sys.platform == 'win32':
             result = path.uri_to_path('file:///C://%C3%A6%C3%B8%C3%A5')
-            self.assertEqual(result, 'C:/æøå')
+            self.assertEqual(result, 'C:/æøå'.encode('utf-8'))
         else:
             result = path.uri_to_path('file:///tmp/%C3%A6%C3%B8%C3%A5')
-            self.assertEqual(result, '/tmp/æøå')
+            self.assertEqual(result, '/tmp/æøå'.encode('utf-8'))
+
+    def test_latin1_in_uri(self):
+        if sys.platform == 'win32':
+            result = path.uri_to_path('file:///C://%E6%F8%E5')
+            self.assertEqual(result, 'C:/æøå'.encode('latin-1'))
+        else:
+            result = path.uri_to_path('file:///tmp/%E6%F8%E5')
+            self.assertEqual(result, '/tmp/æøå'.encode('latin-1'))
 
 
 class SplitPathTest(unittest.TestCase):