utils: Update find to be threaded.
This is needed to speed up searches on network-mounted directories, where we are heavily I/O bound.
This commit is contained in:
parent
590ce19148
commit
4e332da3ed
@ -2,7 +2,10 @@ from __future__ import unicode_literals
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import Queue as queue
|
||||||
|
import stat
|
||||||
import string
|
import string
|
||||||
|
import threading
|
||||||
import urllib
|
import urllib
|
||||||
import urlparse
|
import urlparse
|
||||||
|
|
||||||
@ -107,6 +110,78 @@ def expand_path(path):
|
|||||||
return path
|
return path
|
||||||
|
|
||||||
|
|
||||||
|
def _find_worker(relative, hidden, done, work, results, errors):
    """Worker thread for collecting stat() results.

    Repeatedly takes a path off ``work`` and ``lstat()``s it:

    - directories are listed and their children are queued on ``work``,
    - regular files have their stat result stored in ``results``,
    - anything else (``lstat()`` does not follow symlinks, so this includes
      symlinks) is recorded in ``errors``, as is any ``os.error`` raised
      while inspecting the path.

    Keys in ``results`` and ``errors`` are the queued path made relative to
    ``relative`` when it is set, otherwise the queued path itself.

    :param str relative: directory to make results relative to
    :param bool hidden: if entries starting with . should be included
    :param threading.Event done: event indicating that all work has been done
    :param queue.Queue work: queue of paths to process
    :param dict results: shared dictionary for storing all the stat() results
    :param dict errors: shared dictionary for storing any per path errors
    """
    while not done.is_set():
        try:
            # Block for a short while instead of polling with block=False:
            # a non-blocking get in a tight loop makes every idle worker
            # spin and fight the busy workers for the interpreter lock. The
            # timeout only bounds how long it takes to notice ``done``.
            entry = work.get(timeout=0.05)
        except queue.Empty:
            continue

        if relative:
            path = os.path.relpath(entry, relative)
        else:
            path = entry

        try:
            st = os.lstat(entry)
            if stat.S_ISDIR(st.st_mode):
                for name in os.listdir(entry):
                    # ``hidden`` short-circuits the dot check entirely.
                    if hidden or not name.startswith(b'.'):
                        work.put(os.path.join(entry, name))
            elif stat.S_ISREG(st.st_mode):
                results[path] = st
            else:
                errors[path] = 'Not a file or directory'
        except os.error as e:
            errors[path] = str(e)
        finally:
            # Must run for every successful get(), or work.join() in the
            # coordinating thread would never return.
            work.task_done()
|
||||||
|
|
||||||
|
|
||||||
|
def _find(root, thread_count=10, hidden=True, relative=False):
    """Threaded find implementation that provides stat results for files.

    Note that we do _not_ handle loops from bad sym/hardlinks in any way.

    :param str root: root directory to search from, may not be a file
    :param int thread_count: number of workers to use, mainly useful to
        mitigate network lag when scanning on NFS etc. Values below one are
        treated as one.
    :param bool hidden: include files and directory starting with '.'?
    :param bool relative: if results should be relative to root or absolute
    :returns: a ``(results, errors)`` tuple of dictionaries keyed by path, as
        filled in by the worker threads
    """
    results = {}
    errors = {}
    done = threading.Event()
    work = queue.Queue()
    work.put(os.path.abspath(root))

    # Workers only relativize paths when given a base directory.
    if not relative:
        root = None

    # At least one worker is required: with none, nothing would ever drain
    # the queue and work.join() below would block forever.
    threads = []
    for _ in range(max(1, thread_count)):
        t = threading.Thread(target=_find_worker,
                             args=(root, hidden, done, work, results, errors))
        t.daemon = True
        t.start()
        threads.append(t)

    # Wait until every queued path has been processed, then tell the workers
    # to exit and wait for them to do so.
    work.join()
    done.set()
    for t in threads:
        t.join()
    return results, errors
|
||||||
|
|
||||||
|
|
||||||
def find_files(path):
|
def find_files(path):
|
||||||
"""
|
"""
|
||||||
Finds all files within a path.
|
Finds all files within a path.
|
||||||
@ -119,20 +194,10 @@ def find_files(path):
|
|||||||
path = path.encode('utf-8')
|
path = path.encode('utf-8')
|
||||||
|
|
||||||
if os.path.isfile(path):
|
if os.path.isfile(path):
|
||||||
return
|
return iter([])
|
||||||
|
|
||||||
for dirpath, dirnames, filenames in os.walk(path, followlinks=True):
|
results, errors = _find(path, hidden=False, relative=True)
|
||||||
for dirname in dirnames:
|
return results.iterkeys()
|
||||||
if dirname.startswith(b'.'):
|
|
||||||
# Skip hidden dirs by modifying dirnames inplace
|
|
||||||
dirnames.remove(dirname)
|
|
||||||
|
|
||||||
for filename in filenames:
|
|
||||||
if filename.startswith(b'.'):
|
|
||||||
# Skip hidden files
|
|
||||||
continue
|
|
||||||
|
|
||||||
yield os.path.relpath(os.path.join(dirpath, filename), path)
|
|
||||||
|
|
||||||
|
|
||||||
def check_file_path_is_inside_base_dir(file_path, base_path):
|
def check_file_path_is_inside_base_dir(file_path, base_path):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user