Merge pull request #874 from adamcik/feature/improve-find-and-scan-code

Feature/improve find and scan code
This commit is contained in:
Stein Magnus Jodal 2014-10-20 14:07:46 +02:00
commit c84ed733ee
12 changed files with 203 additions and 45 deletions

View File

@ -86,6 +86,10 @@ See :ref:`config` for general help on configuring Mopidy.
Number of milliseconds before giving up scanning a file and moving on to
the next file.
.. confval:: local/scan_follow_symlinks
If we should follow symlinks found in :confval:`local/media_dir`
.. confval:: local/scan_flush_threshold
Number of tracks to wait before telling library it should try and store

View File

@ -29,6 +29,7 @@ class Extension(ext.Extension):
schema['scan_timeout'] = config.Integer(
minimum=1000, maximum=1000*60*60)
schema['scan_flush_threshold'] = config.Integer(minimum=0)
schema['scan_follow_symlinks'] = config.Boolean()
schema['excluded_file_extensions'] = config.List(optional=True)
return schema

View File

@ -74,9 +74,17 @@ class ScanCommand(commands.Command):
uris_to_update = set()
uris_to_remove = set()
file_mtimes = path.find_mtimes(media_dir)
file_mtimes, file_errors = path.find_mtimes(
media_dir, follow=config['local']['scan_follow_symlinks'])
logger.info('Found %d files in media_dir.', len(file_mtimes))
if file_errors:
logger.warning('Encountered %d errors while scanning media_dir.',
len(file_errors))
for name in file_errors:
logger.debug('Scan error %r for %r', file_errors[name], name)
num_tracks = library.load()
logger.info('Checking %d tracks from library.', num_tracks)
@ -97,11 +105,13 @@ class ScanCommand(commands.Command):
relpath = os.path.relpath(abspath, media_dir)
uri = translator.path_to_local_track_uri(relpath)
if relpath.lower().endswith(excluded_file_extensions):
# TODO: move these to a "predicate" check in the finder?
if b'/.' in relpath:
logger.debug('Skipped %s: Hidden directory/file.', uri)
elif relpath.lower().endswith(excluded_file_extensions):
logger.debug('Skipped %s: File extension excluded.', uri)
continue
uris_to_update.add(uri)
else:
uris_to_update.add(uri)
logger.info(
'Found %d tracks which need to be updated.', len(uris_to_update))

View File

@ -6,6 +6,7 @@ data_dir = $XDG_DATA_DIR/mopidy/local
playlists_dir = $XDG_DATA_DIR/mopidy/local/playlists
scan_timeout = 1000
scan_flush_threshold = 1000
scan_follow_symlinks = false
excluded_file_extensions =
.directory
.html

View File

@ -110,11 +110,11 @@ def expand_path(path):
return path
def _find_worker(relative, hidden, done, work, results, errors):
def _find_worker(relative, follow, done, work, results, errors):
"""Worker thread for collecting stat() results.
:param str relative: directory to make results relative to
:param bool hidden: whether to include files and dirs starting with '.'
:param bool follow: if symlinks should be followed
:param threading.Event done: event indicating that all work has been done
:param queue.Queue work: queue of paths to process
:param dict results: shared dictionary for storing all the stat() results
@ -122,7 +122,7 @@ def _find_worker(relative, hidden, done, work, results, errors):
"""
while not done.is_set():
try:
entry = work.get(block=False)
entry, parents = work.get(block=False)
except queue.Empty:
continue
@ -132,45 +132,57 @@ def _find_worker(relative, hidden, done, work, results, errors):
path = entry
try:
st = os.lstat(entry)
if follow:
st = os.stat(entry)
else:
st = os.lstat(entry)
if (st.st_dev, st.st_ino) in parents:
errors[path] = Exception('Sym/hardlink loop found.')
continue
parents = parents + [(st.st_dev, st.st_ino)]
if stat.S_ISDIR(st.st_mode):
for e in os.listdir(entry):
if hidden or not e.startswith(b'.'):
work.put(os.path.join(entry, e))
work.put((os.path.join(entry, e), parents))
elif stat.S_ISREG(st.st_mode):
results[path] = st
elif stat.S_ISLNK(st.st_mode):
errors[path] = Exception('Not following symlinks.')
else:
errors[path] = 'Not a file or directory'
except os.error as e:
errors[path] = str(e)
errors[path] = Exception('Not a file or directory.')
except OSError as e:
errors[path] = e
finally:
work.task_done()
def _find(root, thread_count=10, hidden=True, relative=False):
def _find(root, thread_count=10, relative=False, follow=False):
"""Threaded find implementation that provides stat results for files.
Note that we do _not_ handle loops from bad sym/hardlinks in any way.
Tries to protect against sym/hardlink loops by keeping an eye on parent
(st_dev, st_ino) pairs.
:param str root: root directory to search from, may not be a file
:param int thread_count: number of workers to use, mainly useful to
mitigate network lag when scanning on NFS etc.
:param bool hidden: whether to include files and dirs starting with '.'
:param bool relative: if results should be relative to root or absolute
:param bool follow: if symlinks should be followed
"""
threads = []
results = {}
errors = {}
done = threading.Event()
work = queue.Queue()
work.put(os.path.abspath(root))
work.put((os.path.abspath(root), []))
if not relative:
root = None
args = (root, follow, done, work, results, errors)
for i in range(thread_count):
t = threading.Thread(target=_find_worker,
args=(root, hidden, done, work, results, errors))
t = threading.Thread(target=_find_worker, args=args)
t.daemon = True
t.start()
threads.append(t)
@ -182,9 +194,10 @@ def _find(root, thread_count=10, hidden=True, relative=False):
return results, errors
def find_mtimes(root):
results, errors = _find(root, hidden=False, relative=False)
return dict((f, int(st.st_mtime)) for f, st in results.iteritems())
def find_mtimes(root, follow=False):
results, errors = _find(root, relative=False, follow=follow)
mtimes = dict((f, int(st.st_mtime)) for f, st in results.iteritems())
return mtimes, errors
def check_file_path_is_inside_base_dir(file_path, base_path):

View File

@ -295,7 +295,8 @@ class ScannerTest(unittest.TestCase):
def find(self, path):
media_dir = path_to_data_dir(path)
for path in path_lib.find_mtimes(media_dir):
result, errors = path_lib.find_mtimes(media_dir)
for path in result:
yield os.path.join(media_dir, path)
def scan(self, paths):

Binary file not shown.

View File

View File

@ -11,7 +11,7 @@ import glib
from mopidy.utils import path
from tests import any_int, path_to_data_dir
import tests
class GetOrCreateDirTest(unittest.TestCase):
@ -214,31 +214,159 @@ class ExpandPathTest(unittest.TestCase):
class FindMTimesTest(unittest.TestCase):
maxDiff = None
def find(self, value):
return path.find_mtimes(path_to_data_dir(value))
def setUp(self):
self.tmpdir = tempfile.mkdtemp(b'.mopidy-tests')
def test_basic_dir(self):
self.assert_(self.find(''))
def tearDown(self):
shutil.rmtree(self.tmpdir, ignore_errors=True)
def test_nonexistant_dir(self):
self.assertEqual(self.find('does-not-exist'), {})
def mkdir(self, *args):
name = os.path.join(self.tmpdir, *[bytes(a) for a in args])
os.mkdir(name)
return name
def test_file(self):
self.assertEqual({path_to_data_dir('blank.mp3'): any_int},
self.find('blank.mp3'))
def test_files(self):
mtimes = self.find('find')
expected_files = [
b'find/foo/bar/file', b'find/foo/file', b'find/baz/file']
expected = {path_to_data_dir(p): any_int for p in expected_files}
self.assertEqual(expected, mtimes)
def touch(self, *args):
name = os.path.join(self.tmpdir, *[bytes(a) for a in args])
open(name, 'w').close()
return name
def test_names_are_bytestrings(self):
is_bytes = lambda f: isinstance(f, bytes)
for name in self.find(''):
self.assert_(
is_bytes(name), '%s is not bytes object' % repr(name))
"""We shouldn't be mixing in unicode for paths."""
result, errors = path.find_mtimes(tests.path_to_data_dir(''))
for name in result.keys() + errors.keys():
self.assertEqual(name, tests.IsA(bytes))
def test_nonexistent_dir(self):
"""Non existent search roots are an error"""
missing = os.path.join(self.tmpdir, 'does-not-exist')
result, errors = path.find_mtimes(missing)
self.assertEqual(result, {})
self.assertEqual(errors, {missing: tests.IsA(OSError)})
def test_empty_dir(self):
"""Empty directories should not show up in results"""
self.mkdir('empty')
result, errors = path.find_mtimes(self.tmpdir)
self.assertEqual(result, {})
self.assertEqual(errors, {})
def test_file_as_the_root(self):
"""Specifying a file as the root should just return the file"""
single = self.touch('single')
result, errors = path.find_mtimes(single)
self.assertEqual(result, {single: tests.any_int})
self.assertEqual(errors, {})
def test_nested_directories(self):
"""Searching nested directories should find all files"""
# Setup foo/bar and baz directories
self.mkdir('foo')
self.mkdir('foo', 'bar')
self.mkdir('baz')
# Touch foo/file foo/bar/file and baz/file
foo_file = self.touch('foo', 'file')
foo_bar_file = self.touch('foo', 'bar', 'file')
baz_file = self.touch('baz', 'file')
result, errors = path.find_mtimes(self.tmpdir)
self.assertEqual(result, {foo_file: tests.any_int,
foo_bar_file: tests.any_int,
baz_file: tests.any_int})
self.assertEqual(errors, {})
def test_missing_permission_to_file(self):
"""Missing permissions to a file is not a search error"""
target = self.touch('no-permission')
os.chmod(target, 0)
result, errors = path.find_mtimes(self.tmpdir)
self.assertEqual({target: tests.any_int}, result)
self.assertEqual({}, errors)
def test_missing_permission_to_directory(self):
"""Missing permissions to a directory is an error"""
directory = self.mkdir('no-permission')
os.chmod(directory, 0)
result, errors = path.find_mtimes(self.tmpdir)
self.assertEqual({}, result)
self.assertEqual({directory: tests.IsA(OSError)}, errors)
def test_symlinks_are_ignored(self):
"""By default symlinks should be treated as an error"""
target = self.touch('target')
link = os.path.join(self.tmpdir, 'link')
os.symlink(target, link)
result, errors = path.find_mtimes(self.tmpdir)
self.assertEqual(result, {target: tests.any_int})
self.assertEqual(errors, {link: tests.IsA(Exception)})
def test_symlink_to_file_as_root_is_followed(self):
"""Passing a symlink as the root should be followed when follow=True"""
target = self.touch('target')
link = os.path.join(self.tmpdir, 'link')
os.symlink(target, link)
result, errors = path.find_mtimes(link, follow=True)
self.assertEqual({link: tests.any_int}, result)
self.assertEqual({}, errors)
def test_symlink_to_directory_is_followed(self):
pass
def test_symlink_pointing_at_itself_fails(self):
"""Symlink pointing at itself should give as an OS error"""
link = os.path.join(self.tmpdir, 'link')
os.symlink(link, link)
result, errors = path.find_mtimes(link, follow=True)
self.assertEqual({}, result)
self.assertEqual({link: tests.IsA(OSError)}, errors)
def test_symlink_pointing_at_parent_fails(self):
"""We should detect a loop via the parent and give up on the branch"""
os.symlink(self.tmpdir, os.path.join(self.tmpdir, 'link'))
result, errors = path.find_mtimes(self.tmpdir, follow=True)
self.assertEqual({}, result)
self.assertEqual(1, len(errors))
self.assertEqual(tests.IsA(Exception), errors.values()[0])
def test_indirect_symlink_loop(self):
"""More indirect loops should also be detected"""
# Setup tmpdir/directory/loop where loop points to tmpdir
directory = os.path.join(self.tmpdir, b'directory')
loop = os.path.join(directory, b'loop')
os.mkdir(directory)
os.symlink(self.tmpdir, loop)
result, errors = path.find_mtimes(self.tmpdir, follow=True)
self.assertEqual({}, result)
self.assertEqual({loop: tests.IsA(Exception)}, errors)
def test_symlink_branches_are_not_excluded(self):
"""Using symlinks to make a file show up multiple times should work"""
self.mkdir('directory')
target = self.touch('directory', 'target')
link1 = os.path.join(self.tmpdir, b'link1')
link2 = os.path.join(self.tmpdir, b'link2')
os.symlink(target, link1)
os.symlink(target, link2)
expected = {target: tests.any_int,
link1: tests.any_int,
link2: tests.any_int}
result, errors = path.find_mtimes(self.tmpdir, follow=True)
self.assertEqual(expected, result)
self.assertEqual({}, errors)
# TODO: kill this in favour of just os.path.getmtime + mocks