Merge pull request #566 from adamcik/feature/scanner

Switch back to custom scanner
2013-11-06 12:31:02 -08:00 · 2013-11-06 12:31:02 -08:00 · 71eff1bf45
commit 71eff1bf45
parent 0ab1aacbc5 86926e8011
4 changed files with 183 additions and 110 deletions
--- a/mopidy/audio/scan.py
+++ b/mopidy/audio/scan.py
@ -0,0 +1,149 @@
+from __future__ import unicode_literals
+
+import pygst
+pygst.require('0.10')
+import gst
+
+import datetime
+import os
+import time
+
+from mopidy import exceptions
+from mopidy.models import Track, Artist, Album
+from mopidy.utils import path
+
+
+class Scanner(object):
+    def __init__(self, timeout=1000, min_duration=100):
+        self.timeout_ms = timeout  # Deadline for _collect().
+        self.min_duration_ms = min_duration  # scan() rejects shorter files.
+
+        sink = gst.element_factory_make('fakesink')
+
+        audio_caps = gst.Caps(b'audio/x-raw-int; audio/x-raw-float')
+        pad_added = lambda src, pad: pad.link(sink.get_pad('sink'))
+
+        self.uribin = gst.element_factory_make('uridecodebin')
+        self.uribin.set_property('caps', audio_caps)
+        self.uribin.connect('pad-added', pad_added)
+
+        self.pipe = gst.element_factory_make('pipeline')
+        self.pipe.add(self.uribin)
+        self.pipe.add(sink)
+
+        self.bus = self.pipe.get_bus()
+        self.bus.set_flushing(True)  # Stay flushed until _setup() runs.
+
+    def scan(self, uri):
+        try:
+            self._setup(uri)
+            data = self._collect()
+            # Make sure uri, mtime and duration do not come from tags.
+            data[b'uri'] = uri
+            data[b'mtime'] = self._query_mtime(uri)
+            data[gst.TAG_DURATION] = self._query_duration()
+        finally:
+            self._reset()  # Runs on success and on error alike.
+
+        if data[gst.TAG_DURATION] < self.min_duration_ms * gst.MSECOND:
+            raise exceptions.ScannerError('Rejecting file with less than %dms '
+                                          'audio data.' % self.min_duration_ms)
+        return data
+
+    def _setup(self, uri):
+        """Primes the pipeline for collection."""
+        self.pipe.set_state(gst.STATE_READY)
+        self.uribin.set_property(b'uri', uri)
+        self.bus.set_flushing(False)
+        self.pipe.set_state(gst.STATE_PAUSED)
+
+    def _collect(self):
+        """Polls for messages to collect data."""
+        start = time.time()
+        timeout_s = self.timeout_ms / float(1000)
+        poll_timeout_ns = 1000  # NOTE(review): 1 us per poll; near busy-wait?
+        data = {}
+
+        while time.time() - start < timeout_s:
+            message = self.bus.poll(gst.MESSAGE_ANY, poll_timeout_ns)
+
+            if message is None:
+                pass  # Polling the bus timed out; re-check the deadline.
+            elif message.type == gst.MESSAGE_ERROR:
+                raise exceptions.ScannerError(message.parse_error()[0])
+            elif message.type == gst.MESSAGE_EOS:
+                return data
+            elif message.type == gst.MESSAGE_ASYNC_DONE:
+                if message.src == self.pipe:  # Ignore ASYNC_DONE from others.
+                    return data
+            elif message.type == gst.MESSAGE_TAG:
+                taglist = message.parse_tag()
+                for key in taglist.keys():
+                    data[key] = taglist[key]  # Later messages overwrite.
+
+        raise exceptions.ScannerError('Timeout after %dms' % self.timeout_ms)
+
+    def _reset(self):
+        """Ensures we cleanup child elements and flush the bus."""
+        self.bus.set_flushing(True)
+        self.pipe.set_state(gst.STATE_NULL)
+
+    def _query_duration(self):
+        try:
+            return self.pipe.query_duration(gst.FORMAT_TIME, None)[0]
+        except gst.QueryError:
+            return None  # NOTE(review): scan() compares this None to an int.
+
+    def _query_mtime(self, uri):
+        if not uri.startswith('file:'):
+            return None  # Non-file URIs get no mtime.
+        return os.path.getmtime(path.uri_to_path(uri))
+
+
+def audio_data_to_track(data):
+    """Convert taglist data + our extras to a track."""
+    albumartist_kwargs = {}
+    album_kwargs = {}
+    artist_kwargs = {}
+    track_kwargs = {}
+
+    def _retrieve(source_key, target_key, target):
+        if source_key in data:  # Missing tags are simply left out.
+            target[target_key] = data[source_key]
+
+    _retrieve(gst.TAG_ALBUM, 'name', album_kwargs)
+    _retrieve(gst.TAG_TRACK_COUNT, 'num_tracks', album_kwargs)
+    _retrieve(gst.TAG_ALBUM_VOLUME_COUNT, 'num_discs', album_kwargs)
+    _retrieve(gst.TAG_ARTIST, 'name', artist_kwargs)
+
+    if gst.TAG_DATE in data and data[gst.TAG_DATE]:
+        date = data[gst.TAG_DATE]
+        try:
+            date = datetime.date(date.year, date.month, date.day)
+        except ValueError:
+            pass  # Ignore invalid dates; the track then gets no date.
+        else:
+            track_kwargs['date'] = date.isoformat()
+
+    _retrieve(gst.TAG_TITLE, 'name', track_kwargs)
+    _retrieve(gst.TAG_TRACK_NUMBER, 'track_no', track_kwargs)
+    _retrieve(gst.TAG_ALBUM_VOLUME_NUMBER, 'disc_no', track_kwargs)
+
+    # The following keys don't seem to have TAG_* constants.
+    _retrieve('album-artist', 'name', albumartist_kwargs)
+    _retrieve('musicbrainz-trackid', 'musicbrainz_id', track_kwargs)
+    _retrieve('musicbrainz-artistid', 'musicbrainz_id', artist_kwargs)
+    _retrieve('musicbrainz-albumid', 'musicbrainz_id', album_kwargs)
+    _retrieve(
+        'musicbrainz-albumartistid', 'musicbrainz_id', albumartist_kwargs)
+
+    if albumartist_kwargs:
+        album_kwargs['artists'] = [Artist(**albumartist_kwargs)]
+
+    track_kwargs['uri'] = data['uri']
+    # NOTE(review): int() raises TypeError if mtime is None (non-file uri?).
+    track_kwargs['last_modified'] = int(data['mtime'])
+    track_kwargs['length'] = data[gst.TAG_DURATION] // gst.MSECOND  # To ms.
+    track_kwargs['album'] = Album(**album_kwargs)
+    track_kwargs['artists'] = [Artist(**artist_kwargs)]  # Even if no tags.
+
+    return Track(**track_kwargs)
--- a/mopidy/frontends/mpd/translator.py
+++ b/mopidy/frontends/mpd/translator.py
@ -301,6 +301,7 @@ def _add_to_tag_cache(result, dirs, files, media_dir):
        relative_path = os.path.relpath(path, base_path)
        relative_uri = urllib.quote(relative_path)

+        # TODO: use track.last_modified
        track_result['file'] = relative_uri
        track_result['mtime'] = get_mtime(path)
        track_result['key'] = os.path.basename(text_path)
--- a/mopidy/scanner.py
+++ b/mopidy/scanner.py
@ -1,28 +1,21 @@
 from __future__ import unicode_literals

 import argparse
-import datetime
 import logging
 import os
 import sys
+import time

 import gobject
 gobject.threads_init()

-
 # Extract any command line arguments. This needs to be done before GStreamer is
 # imported, so that GStreamer doesn't hijack e.g. ``--help``.
 mopidy_args = sys.argv[1:]
 sys.argv[1:] = []

-
-import pygst
-pygst.require('0.10')
-import gst
-import gst.pbutils
-
 from mopidy import config as config_lib, exceptions, ext
-from mopidy.models import Track, Artist, Album
+from mopidy.audio import scan
 from mopidy.utils import log, path, versioning


@ -80,11 +73,13 @@ def main():
    logging.info('Checking tracks from library.')
    for track in local_updater.load():
        try:
+            # TODO: convert local to file uri / path
            stat = os.stat(path.uri_to_path(track.uri))
            if int(stat.st_mtime) > track.last_modified:
                uris_update.add(track.uri)
            uris_library.add(track.uri)
        except OSError:
+            logging.debug('Missing file %s', track.uri)
            uris_remove.add(track.uri)

    logging.info('Removing %d moved or deleted tracks.', len(uris_remove))
@ -103,21 +98,39 @@ def main():
    logging.info('Found %d new or modified tracks.', len(uris_update))
    logging.info('Scanning new and modified tracks.')

-    scanner = Scanner(config['local']['scan_timeout'])
-    for uri in uris_update:
+    scanner = scan.Scanner(config['local']['scan_timeout'])
+    progress = Progress(len(uris_update))
+
+    for uri in sorted(uris_update):
        try:
            data = scanner.scan(uri)
-            data[b'mtime'] = os.path.getmtime(path.uri_to_path(uri))
-            track = translator(data)
+            track = scan.audio_data_to_track(data)
            local_updater.add(track)
            logging.debug('Added %s', track.uri)
        except exceptions.ScannerError as error:
            logging.warning('Failed %s: %s', uri, error)

-    logging.info('Done scanning; commiting changes.')
+        progress.increment()
+
+    logging.info('Commiting changes.')
    local_updater.commit()


+class Progress(object):
+    def __init__(self, total):
+        self.count = 0  # Files processed so far.
+        self.total = total  # Number of files expected.
+        self.start = time.time()
+
+    def increment(self):  # Count one file; log every 1000 and at the end.
+        self.count += 1
+        if self.count % 1000 == 0 or self.count == self.total:
+            duration = time.time() - self.start  # Seconds since __init__.
+            remainder = duration / self.count * (self.total - self.count)
+            logging.info('Scanned %d of %d files in %ds, ~%ds left.',
+                         self.count, self.total, duration, remainder)
+
+
 def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
@ -134,95 +147,5 @@ def parse_args():
    return parser.parse_args(args=mopidy_args)


-# TODO: move into scanner.
-def translator(data):
-    albumartist_kwargs = {}
-    album_kwargs = {}
-    artist_kwargs = {}
-    track_kwargs = {}
-
-    def _retrieve(source_key, target_key, target):
-        if source_key in data:
-            target[target_key] = data[source_key]
-
-    _retrieve(gst.TAG_ALBUM, 'name', album_kwargs)
-    _retrieve(gst.TAG_TRACK_COUNT, 'num_tracks', album_kwargs)
-    _retrieve(gst.TAG_ALBUM_VOLUME_COUNT, 'num_discs', album_kwargs)
-    _retrieve(gst.TAG_ARTIST, 'name', artist_kwargs)
-
-    if gst.TAG_DATE in data and data[gst.TAG_DATE]:
-        date = data[gst.TAG_DATE]
-        try:
-            date = datetime.date(date.year, date.month, date.day)
-        except ValueError:
-            pass  # Ignore invalid dates
-        else:
-            track_kwargs['date'] = date.isoformat()
-
-    _retrieve(gst.TAG_TITLE, 'name', track_kwargs)
-    _retrieve(gst.TAG_TRACK_NUMBER, 'track_no', track_kwargs)
-    _retrieve(gst.TAG_ALBUM_VOLUME_NUMBER, 'disc_no', track_kwargs)
-
-    # Following keys don't seem to have TAG_* constant.
-    _retrieve('album-artist', 'name', albumartist_kwargs)
-    _retrieve('musicbrainz-trackid', 'musicbrainz_id', track_kwargs)
-    _retrieve('musicbrainz-artistid', 'musicbrainz_id', artist_kwargs)
-    _retrieve('musicbrainz-albumid', 'musicbrainz_id', album_kwargs)
-    _retrieve(
-        'musicbrainz-albumartistid', 'musicbrainz_id', albumartist_kwargs)
-
-    if albumartist_kwargs:
-        album_kwargs['artists'] = [Artist(**albumartist_kwargs)]
-
-    track_kwargs['uri'] = data['uri']
-    track_kwargs['last_modified'] = int(data['mtime'])
-    track_kwargs['length'] = data[gst.TAG_DURATION]
-    track_kwargs['album'] = Album(**album_kwargs)
-    track_kwargs['artists'] = [Artist(**artist_kwargs)]
-
-    return Track(**track_kwargs)
-
-
-class Scanner(object):
-    def __init__(self, timeout=1000):
-        self.discoverer = gst.pbutils.Discoverer(timeout * 1000000)
-
-    def scan(self, uri):
-        try:
-            info = self.discoverer.discover_uri(uri)
-        except gobject.GError as e:
-            # Loosing traceback is non-issue since this is from C code.
-            raise exceptions.ScannerError(e)
-
-        data = {}
-        audio_streams = info.get_audio_streams()
-
-        if not audio_streams:
-            raise exceptions.ScannerError('Did not find any audio streams.')
-
-        for stream in audio_streams:
-            taglist = stream.get_tags()
-            if not taglist:
-                continue
-            for key in taglist.keys():
-                # XXX: For some crazy reason some wma files spit out lists
-                # here, not sure if this is due to better data in headers or
-                # wma being stupid. So ugly hack for now :/
-                if type(taglist[key]) is list:
-                    data[key] = taglist[key][0]
-                else:
-                    data[key] = taglist[key]
-
-        # Never trust metadata for these fields:
-        data[b'uri'] = uri
-        data[b'duration'] = info.get_duration() // gst.MSECOND
-
-        if data[b'duration'] < 100:
-            raise exceptions.ScannerError(
-                'Rejecting file with less than 100ms audio data.')
-
-        return data
-
-
 if __name__ == '__main__':
    main()
--- a/tests/audio/scan_test.py
+++ b/tests/audio/scan_test.py
@ -3,8 +3,8 @@ from __future__ import unicode_literals
 import unittest

 from mopidy import exceptions
+from mopidy.audio import scan
 from mopidy.models import Track, Artist, Album
-from mopidy.scanner import Scanner, translator
 from mopidy.utils import path as path_lib

 from tests import path_to_data_dir
@ -31,7 +31,7 @@ class TranslatorTest(unittest.TestCase):
            'album-disc-count': 3,
            'date': FakeGstDate(2006, 1, 1,),
            'container-format': 'ID3 tag',
-            'duration': 4531,
+            'duration': 4531000000,
            'musicbrainz-trackid': 'mbtrackid',
            'musicbrainz-albumid': 'mbalbumid',
            'musicbrainz-artistid': 'mbartistid',
@ -76,7 +76,7 @@ class TranslatorTest(unittest.TestCase):

    def check(self):
        expected = self.build_track()
-        actual = translator(self.data)
+        actual = scan.audio_data_to_track(self.data)
        self.assertEqual(expected, actual)

    def test_basic_data(self):
@ -151,7 +151,7 @@ class ScannerTest(unittest.TestCase):
    def scan(self, path):
        paths = path_lib.find_files(path_to_data_dir(path))
        uris = (path_lib.path_to_uri(p) for p in paths)
-        scanner = Scanner()
+        scanner = scan.Scanner()
        for uri in uris:
            key = uri[len('file://'):]
            try:
@ -182,8 +182,8 @@ class ScannerTest(unittest.TestCase):

    def test_duration_is_set(self):
        self.scan('scanner/simple')
-        self.check('scanner/simple/song1.mp3', 'duration', 4680)
-        self.check('scanner/simple/song1.ogg', 'duration', 4680)
+        self.check('scanner/simple/song1.mp3', 'duration', 4680000000)
+        self.check('scanner/simple/song1.ogg', 'duration', 4680000000)

    def test_artist_is_set(self):
        self.scan('scanner/simple')