Merge pull request #566 from adamcik/feature/scanner

Switch back to custom scanner
This commit is contained in:
Stein Magnus Jodal 2013-11-06 12:31:02 -08:00
commit 71eff1bf45
4 changed files with 183 additions and 110 deletions

149
mopidy/audio/scan.py Normal file
View File

@ -0,0 +1,149 @@
from __future__ import unicode_literals
import pygst
pygst.require('0.10')
import gst
import datetime
import os
import time
from mopidy import exceptions
from mopidy.models import Track, Artist, Album
from mopidy.utils import path
class Scanner(object):
    """Collect tags, duration and mtime for a URI via a GStreamer pipeline.

    A ``uridecodebin`` (with its ``caps`` property set to raw int/float
    audio) and a ``fakesink`` are added to one pipeline that is reused for
    every call to :meth:`scan`.

    :param timeout: how long, in milliseconds, :meth:`scan` waits for the
        pipeline before giving up on a URI
    :param min_duration: files shorter than this many milliseconds are
        rejected by :meth:`scan`
    """

    def __init__(self, timeout=1000, min_duration=100):
        self.timeout_ms = timeout
        self.min_duration_ms = min_duration

        sink = gst.element_factory_make('fakesink')

        audio_caps = gst.Caps(b'audio/x-raw-int; audio/x-raw-float')
        # Link every pad uridecodebin exposes to the fakesink's sink pad.
        pad_added = lambda src, pad: pad.link(sink.get_pad('sink'))

        self.uribin = gst.element_factory_make('uridecodebin')
        self.uribin.set_property('caps', audio_caps)
        self.uribin.connect('pad-added', pad_added)

        self.pipe = gst.element_factory_make('pipeline')
        self.pipe.add(self.uribin)
        self.pipe.add(sink)

        # The bus stays flushing between scans; _setup() turns it back on.
        self.bus = self.pipe.get_bus()
        self.bus.set_flushing(True)

    def scan(self, uri):
        """Scan ``uri`` and return the collected data as a dict.

        The dict holds whatever tags were seen on the bus, plus the keys
        ``uri``, ``mtime`` and ``gst.TAG_DURATION`` which are always set
        by the scanner itself.

        :param uri: URI of the resource to scan
        :raises: :exc:`exceptions.ScannerError` on a pipeline error, on
            timeout, or when the duration is unknown or shorter than
            ``min_duration``
        """
        try:
            self._setup(uri)
            data = self._collect()
            # Make sure uri, mtime and duration do not come from tags.
            data[b'uri'] = uri
            data[b'mtime'] = self._query_mtime(uri)
            data[gst.TAG_DURATION] = self._query_duration()
        finally:
            self._reset()

        duration = data[gst.TAG_DURATION]
        # _query_duration() returns None when the query fails. Reject that
        # case explicitly instead of relying on how None compares to ints.
        if duration is None or duration < self.min_duration_ms * gst.MSECOND:
            raise exceptions.ScannerError('Rejecting file with less than %dms '
                                          'audio data.' % self.min_duration_ms)
        return data

    def _setup(self, uri):
        """Primes the pipeline for collection."""
        self.pipe.set_state(gst.STATE_READY)
        self.uribin.set_property(b'uri', uri)
        self.bus.set_flushing(False)
        self.pipe.set_state(gst.STATE_PAUSED)

    def _collect(self):
        """Polls for messages to collect data.

        Returns the dict of tags gathered so far once an EOS message, or an
        ASYNC_DONE message from the pipeline itself, is seen.

        :raises: :exc:`exceptions.ScannerError` on an ERROR message or when
            ``timeout_ms`` elapses first
        """
        start = time.time()
        timeout_s = self.timeout_ms / float(1000)
        # Timeout passed to each individual bus.poll() call; the overall
        # deadline is enforced by the loop condition below.
        # NOTE(review): the name implies nanoseconds - confirm against the
        # gst bindings.
        poll_timeout_ns = 1000
        data = {}

        while time.time() - start < timeout_s:
            message = self.bus.poll(gst.MESSAGE_ANY, poll_timeout_ns)

            if message is None:
                pass  # polling the bus timed out.
            elif message.type == gst.MESSAGE_ERROR:
                raise exceptions.ScannerError(message.parse_error()[0])
            elif message.type == gst.MESSAGE_EOS:
                return data
            elif message.type == gst.MESSAGE_ASYNC_DONE:
                # Only ASYNC_DONE posted by the pipeline itself ends the
                # collection; the same message from other sources is ignored.
                if message.src == self.pipe:
                    return data
            elif message.type == gst.MESSAGE_TAG:
                taglist = message.parse_tag()
                for key in taglist.keys():
                    data[key] = taglist[key]

        raise exceptions.ScannerError('Timeout after %dms' % self.timeout_ms)

    def _reset(self):
        """Ensures we cleanup child elements and flush the bus."""
        self.bus.set_flushing(True)
        self.pipe.set_state(gst.STATE_NULL)

    def _query_duration(self):
        """Return the pipeline duration, or ``None`` if the query fails."""
        try:
            return self.pipe.query_duration(gst.FORMAT_TIME, None)[0]
        except gst.QueryError:
            return None

    def _query_mtime(self, uri):
        """Return the file's mtime, or ``None`` for non ``file:`` URIs."""
        if not uri.startswith('file:'):
            return None
        return os.path.getmtime(path.uri_to_path(uri))
def audio_data_to_track(data):
    """Convert taglist data + our extras to a track.

    :param data: dict keyed by GStreamer tag names, which must also contain
        the ``uri``, ``mtime`` and ``gst.TAG_DURATION`` entries that
        ``Scanner.scan`` adds
    :returns: a :class:`Track` built from the tags present in ``data``
    """
    albumartist_kwargs = {}
    album_kwargs = {}
    artist_kwargs = {}
    track_kwargs = {}

    def _retrieve(source_key, target_key, target):
        # Copy a tag into the given kwargs dict only when it is present.
        if source_key in data:
            target[target_key] = data[source_key]

    _retrieve(gst.TAG_ALBUM, 'name', album_kwargs)
    _retrieve(gst.TAG_TRACK_COUNT, 'num_tracks', album_kwargs)
    _retrieve(gst.TAG_ALBUM_VOLUME_COUNT, 'num_discs', album_kwargs)
    _retrieve(gst.TAG_ARTIST, 'name', artist_kwargs)

    if gst.TAG_DATE in data and data[gst.TAG_DATE]:
        date = data[gst.TAG_DATE]
        try:
            date = datetime.date(date.year, date.month, date.day)
        except ValueError:
            pass  # Ignore invalid dates
        else:
            track_kwargs['date'] = date.isoformat()

    _retrieve(gst.TAG_TITLE, 'name', track_kwargs)
    _retrieve(gst.TAG_TRACK_NUMBER, 'track_no', track_kwargs)
    _retrieve(gst.TAG_ALBUM_VOLUME_NUMBER, 'disc_no', track_kwargs)

    # Following keys don't seem to have TAG_* constant.
    _retrieve('album-artist', 'name', albumartist_kwargs)
    _retrieve('musicbrainz-trackid', 'musicbrainz_id', track_kwargs)
    _retrieve('musicbrainz-artistid', 'musicbrainz_id', artist_kwargs)
    _retrieve('musicbrainz-albumid', 'musicbrainz_id', album_kwargs)
    _retrieve(
        'musicbrainz-albumartistid', 'musicbrainz_id', albumartist_kwargs)

    if albumartist_kwargs:
        album_kwargs['artists'] = [Artist(**albumartist_kwargs)]

    track_kwargs['uri'] = data['uri']

    # The scanner stores None as mtime for URIs that are not ``file:`` URIs,
    # and int(None) would raise TypeError, so only set the field when there
    # is a value.
    # NOTE(review): assumes Track treats last_modified as optional, like the
    # other keyword arguments built here - confirm against the model.
    mtime = data['mtime']
    if mtime is not None:
        track_kwargs['last_modified'] = int(mtime)

    # The duration is converted to milliseconds for the track length.
    track_kwargs['length'] = data[gst.TAG_DURATION] // gst.MSECOND
    track_kwargs['album'] = Album(**album_kwargs)
    track_kwargs['artists'] = [Artist(**artist_kwargs)]

    return Track(**track_kwargs)

View File

@ -301,6 +301,7 @@ def _add_to_tag_cache(result, dirs, files, media_dir):
relative_path = os.path.relpath(path, base_path)
relative_uri = urllib.quote(relative_path)
# TODO: use track.last_modified
track_result['file'] = relative_uri
track_result['mtime'] = get_mtime(path)
track_result['key'] = os.path.basename(text_path)

View File

@ -1,28 +1,21 @@
from __future__ import unicode_literals
import argparse
import datetime
import logging
import os
import sys
import time
import gobject
gobject.threads_init()
# Extract any command line arguments. This needs to be done before GStreamer is
# imported, so that GStreamer doesn't hijack e.g. ``--help``.
mopidy_args = sys.argv[1:]
sys.argv[1:] = []
import pygst
pygst.require('0.10')
import gst
import gst.pbutils
from mopidy import config as config_lib, exceptions, ext
from mopidy.models import Track, Artist, Album
from mopidy.audio import scan
from mopidy.utils import log, path, versioning
@ -80,11 +73,13 @@ def main():
logging.info('Checking tracks from library.')
for track in local_updater.load():
try:
# TODO: convert local to file uri / path
stat = os.stat(path.uri_to_path(track.uri))
if int(stat.st_mtime) > track.last_modified:
uris_update.add(track.uri)
uris_library.add(track.uri)
except OSError:
logging.debug('Missing file %s', track.uri)
uris_remove.add(track.uri)
logging.info('Removing %d moved or deleted tracks.', len(uris_remove))
@ -103,21 +98,39 @@ def main():
logging.info('Found %d new or modified tracks.', len(uris_update))
logging.info('Scanning new and modified tracks.')
scanner = Scanner(config['local']['scan_timeout'])
for uri in uris_update:
scanner = scan.Scanner(config['local']['scan_timeout'])
progress = Progress(len(uris_update))
for uri in sorted(uris_update):
try:
data = scanner.scan(uri)
data[b'mtime'] = os.path.getmtime(path.uri_to_path(uri))
track = translator(data)
track = scan.audio_data_to_track(data)
local_updater.add(track)
logging.debug('Added %s', track.uri)
except exceptions.ScannerError as error:
logging.warning('Failed %s: %s', uri, error)
logging.info('Done scanning; commiting changes.')
progress.increment()
logging.info('Commiting changes.')
local_updater.commit()
class Progress(object):
    """Counts processed files and logs progress with a time estimate.

    :param total: number of files that are going to be processed
    """

    def __init__(self, total):
        self.total = total
        self.count = 0
        self.start = time.time()

    def increment(self):
        """Register one more processed file.

        A status line is logged on every 1000th file and on the last one.
        """
        self.count += 1

        at_milestone = self.count % 1000 == 0
        finished = self.count == self.total
        if not (at_milestone or finished):
            return

        elapsed = time.time() - self.start
        # Average time per file so far, scaled by the files still to go.
        per_file = elapsed / self.count
        left = per_file * (self.total - self.count)
        logging.info(
            'Scanned %d of %d files in %ds, ~%ds left.',
            self.count, self.total, elapsed, left)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
@ -134,95 +147,5 @@ def parse_args():
return parser.parse_args(args=mopidy_args)
# TODO: move into scanner.
def translator(data):
    """Build a track from a dict of scanned tag data.

    ``data`` must contain the ``uri``, ``mtime`` and ``gst.TAG_DURATION``
    entries; every other tag is optional and only used when present.
    """
    track_kwargs = {}
    album_kwargs = {}
    artist_kwargs = {}
    albumartist_kwargs = {}

    # (key in data, keyword argument name, kwargs dict it ends up in)
    tag_mapping = (
        (gst.TAG_ALBUM, 'name', album_kwargs),
        (gst.TAG_TRACK_COUNT, 'num_tracks', album_kwargs),
        (gst.TAG_ALBUM_VOLUME_COUNT, 'num_discs', album_kwargs),
        (gst.TAG_ARTIST, 'name', artist_kwargs),
        (gst.TAG_TITLE, 'name', track_kwargs),
        (gst.TAG_TRACK_NUMBER, 'track_no', track_kwargs),
        (gst.TAG_ALBUM_VOLUME_NUMBER, 'disc_no', track_kwargs),
        # Following keys don't seem to have TAG_* constant.
        ('album-artist', 'name', albumartist_kwargs),
        ('musicbrainz-trackid', 'musicbrainz_id', track_kwargs),
        ('musicbrainz-artistid', 'musicbrainz_id', artist_kwargs),
        ('musicbrainz-albumid', 'musicbrainz_id', album_kwargs),
        ('musicbrainz-albumartistid', 'musicbrainz_id', albumartist_kwargs),
    )
    for source_key, target_key, target in tag_mapping:
        if source_key in data:
            target[target_key] = data[source_key]

    raw_date = data[gst.TAG_DATE] if gst.TAG_DATE in data else None
    if raw_date:
        try:
            parsed = datetime.date(
                raw_date.year, raw_date.month, raw_date.day)
        except ValueError:
            parsed = None  # Ignore invalid dates
        if parsed is not None:
            track_kwargs['date'] = parsed.isoformat()

    if albumartist_kwargs:
        album_kwargs['artists'] = [Artist(**albumartist_kwargs)]

    track_kwargs.update({
        'uri': data['uri'],
        'last_modified': int(data['mtime']),
        'length': data[gst.TAG_DURATION],
        'album': Album(**album_kwargs),
        'artists': [Artist(**artist_kwargs)],
    })
    return Track(**track_kwargs)
class Scanner(object):
    """Scanner that reads tags and duration via ``gst.pbutils.Discoverer``.

    :param timeout: value that is multiplied by 1000000 and handed to the
        discoverer as its timeout
    """

    def __init__(self, timeout=1000):
        self.discoverer = gst.pbutils.Discoverer(timeout * 1000000)

    def scan(self, uri):
        """Return a dict with the tags, ``uri`` and ``duration`` of ``uri``.

        :raises: :exc:`exceptions.ScannerError` when discovery fails, when
            no audio streams are found, or when the duration is below 100
        """
        try:
            info = self.discoverer.discover_uri(uri)
        except gobject.GError as e:
            # Losing the traceback is a non-issue since this is from C code.
            raise exceptions.ScannerError(e)

        audio_streams = info.get_audio_streams()
        if not audio_streams:
            raise exceptions.ScannerError('Did not find any audio streams.')

        data = {}
        for stream in audio_streams:
            taglist = stream.get_tags()
            if not taglist:
                continue

            for key in taglist.keys():
                value = taglist[key]
                # XXX: Some wma files yield lists here instead of single
                # values; only the first entry is kept in that case.
                data[key] = value[0] if type(value) is list else value

        # Never trust metadata for these fields:
        duration = info.get_duration() // gst.MSECOND
        data[b'uri'] = uri
        data[b'duration'] = duration

        if duration < 100:
            raise exceptions.ScannerError(
                'Rejecting file with less than 100ms audio data.')

        return data
if __name__ == '__main__':
main()

View File

@ -3,8 +3,8 @@ from __future__ import unicode_literals
import unittest
from mopidy import exceptions
from mopidy.audio import scan
from mopidy.models import Track, Artist, Album
from mopidy.scanner import Scanner, translator
from mopidy.utils import path as path_lib
from tests import path_to_data_dir
@ -31,7 +31,7 @@ class TranslatorTest(unittest.TestCase):
'album-disc-count': 3,
'date': FakeGstDate(2006, 1, 1,),
'container-format': 'ID3 tag',
'duration': 4531,
'duration': 4531000000,
'musicbrainz-trackid': 'mbtrackid',
'musicbrainz-albumid': 'mbalbumid',
'musicbrainz-artistid': 'mbartistid',
@ -76,7 +76,7 @@ class TranslatorTest(unittest.TestCase):
def check(self):
expected = self.build_track()
actual = translator(self.data)
actual = scan.audio_data_to_track(self.data)
self.assertEqual(expected, actual)
def test_basic_data(self):
@ -151,7 +151,7 @@ class ScannerTest(unittest.TestCase):
def scan(self, path):
paths = path_lib.find_files(path_to_data_dir(path))
uris = (path_lib.path_to_uri(p) for p in paths)
scanner = Scanner()
scanner = scan.Scanner()
for uri in uris:
key = uri[len('file://'):]
try:
@ -182,8 +182,8 @@ class ScannerTest(unittest.TestCase):
def test_duration_is_set(self):
self.scan('scanner/simple')
self.check('scanner/simple/song1.mp3', 'duration', 4680)
self.check('scanner/simple/song1.ogg', 'duration', 4680)
self.check('scanner/simple/song1.mp3', 'duration', 4680000000)
self.check('scanner/simple/song1.ogg', 'duration', 4680000000)
def test_artist_is_set(self):
self.scan('scanner/simple')