Differences

This shows you the differences between two versions of the page.

random:python_binary_search_file_by_line [2013/12/03 05:12] (current)
grant created
Line 1: Line 1:
 +======Python Binary Search File By Line======
 +Works well for large (10s of GBs) files. I used this to parse and explore the [[http://webdatacommons.org/hyperlinkgraph|Web Data Commons Hyperlink Graph]].
 +
 +<code python>
 +import os
 +
 +def line_binary_search(filename, matchvalue, key=lambda val: val):
 +    """
 +    Binary search a file for matching lines.
 +    Returns a list of matching lines.
 +    filename - path to file, passed to 'open'
 +    matchvalue - value to match
 +    key - function to extract comparison value from line
 +
 +    >>> parser = lambda val: int(val.split('\t')[0].strip())
 +    >>> line_binary_search('sd-arc', 63889187, parser)
 +    ['63889187\t3592559\n', ...]
 +    """
 +
 +    # Must be greater than the maximum length of any line.
 +
 +    max_line_len = 2 ** 12
 +
 +    start = pos = 0
 +    end = os.path.getsize(filename)
 +
 +    with open(filename, 'rb') as fptr:
 +
 +        # Limit the number of times we binary search.
 +
 +        for rpt in xrange(50):
 +
 +            last = pos
 +            pos = start + ((end - start) / 2)
 +            fptr.seek(pos)
 +
 +            # Move the cursor to a newline boundary.
 +
 +            fptr.readline()
 +
 +            line = fptr.readline()
 +            linevalue = key(line)
 +
 +            if linevalue == matchvalue or pos == last:
 +
 +                # Seek back until we no longer have a match.
 +
 +                while True:
 +                    fptr.seek(-max_line_len, 1)
 +                    fptr.readline()
 +                    if matchvalue != key(fptr.readline()):
 +                        break
 +
 +               # Seek forward to the first match.
 +
 +                for rpt in xrange(max_line_len):
 +                    line = fptr.readline()
 +                    linevalue = key(line)
 +                    if matchvalue == linevalue:
 +                        break
 +                else:
 +                    # No match was found.
 +
 +                    return []
 +
 +                results = []
 +
 +                while linevalue == matchvalue:
 +                    results.append(line)
 +                    line = fptr.readline()
 +                    linevalue = key(line)
 +
 +                return results
 +            elif linevalue < matchvalue:
 +                start = fptr.tell()
 +            else:
 +                assert linevalue > matchvalue
 +                end = fptr.tell()
 +        else:
 +            raise RuntimeError('binary search failed')
 +</code>
random/python_binary_search_file_by_line.txt · Last modified: 2013/12/03 05:12 by grant