Improved BinaryHeap. Added largestPartialIntersection.

This commit is contained in:
Andrei Alexandrescu 2009-04-30 19:26:24 +00:00
parent fbec43b21f
commit 574af5bf3e

View file

@ -1958,7 +1958,7 @@ Tuple!(ElementType!(Range), size_t)
minCount(alias pred = "a < b", Range)(Range range) minCount(alias pred = "a < b", Range)(Range range)
{ {
if (range.empty) return typeof(return)(); if (range.empty) return typeof(return)();
auto p = &(range.front); auto p = &(range.front());
size_t occurrences = 1; size_t occurrences = 1;
for (range.popFront; !range.empty; range.popFront) for (range.popFront; !range.empty; range.popFront)
{ {
@ -1966,7 +1966,7 @@ minCount(alias pred = "a < b", Range)(Range range)
if (binaryFun!(pred)(range.front, *p)) if (binaryFun!(pred)(range.front, *p))
{ {
// change the min // change the min
p = &(range.front); p = &(range.front());
occurrences = 1; occurrences = 1;
} }
else else
@ -3849,22 +3849,16 @@ void topNIndex(
Range, RangeIndex)(Range r, RangeIndex index, bool sorted = false) Range, RangeIndex)(Range r, RangeIndex index, bool sorted = false)
if (isIntegral!(ElementType!(RangeIndex))) if (isIntegral!(ElementType!(RangeIndex)))
{ {
{ enforce(ElementType!(RangeIndex).max >= index.length,
size_t i; "Index type too small");
enforce(ElementType!(RangeIndex).max >= index.length,
"Index type too small");
foreach (ref e; index) e = cast(typeof(e)) i++;
}
bool indirectLess(ElementType!(RangeIndex) a, ElementType!(RangeIndex) b) bool indirectLess(ElementType!(RangeIndex) a, ElementType!(RangeIndex) b)
{ {
return binaryFun!(less)(r[a], r[b]); return binaryFun!(less)(r[a], r[b]);
} }
auto heap = BinaryHeap!(RangeIndex, indirectLess)(index); auto heap = BinaryHeap!(RangeIndex, indirectLess)(index, 0);
foreach (i; index.length .. r.length) foreach (i; 0 .. r.length)
{ {
if (!indirectLess(cast(ElementType!(RangeIndex)) i, heap.top)) continue; heap.conditionalPut(cast(ElementType!RangeIndex) i);
// replace the top with e
heap.replaceTop(cast(ElementType!(RangeIndex)) i);
} }
if (sorted) heap.pop(heap.length); if (sorted) heap.pop(heap.length);
} }
@ -3875,21 +3869,15 @@ void topNIndex(
Range, RangeIndex)(Range r, RangeIndex index, bool sorted = false) Range, RangeIndex)(Range r, RangeIndex index, bool sorted = false)
if (is(ElementType!(RangeIndex) == ElementType!(Range)*)) if (is(ElementType!(RangeIndex) == ElementType!(Range)*))
{ {
{
size_t i;
foreach (ref e; index) e = &r[i++];
}
static bool indirectLess(const ElementType!(RangeIndex) a, static bool indirectLess(const ElementType!(RangeIndex) a,
const ElementType!(RangeIndex) b) const ElementType!(RangeIndex) b)
{ {
return binaryFun!(less)(*a, *b); return binaryFun!less(*a, *b);
} }
auto heap = BinaryHeap!(RangeIndex, indirectLess)(index); auto heap = BinaryHeap!(RangeIndex, indirectLess)(index, 0);
foreach (i; index.length .. r.length) foreach (i; 0 .. r.length)
{ {
if (!indirectLess(&r[i], heap.top)) continue; heap.conditionalPut(&r[i]);
// replace the top with e
heap.replaceTop(&r[i]);
} }
if (sorted) heap.pop(heap.length); if (sorted) heap.pop(heap.length);
} }
@ -3908,7 +3896,7 @@ unittest
auto b = new ubyte[5]; auto b = new ubyte[5];
topNIndex!("a > b")(a, b, true); topNIndex!("a > b")(a, b, true);
//foreach (e; b) writeln(e, ":", a[e]); //foreach (e; b) writeln(e, ":", a[e]);
assert(b == [ cast(ubyte) 0, 2, 1, 6, 5]); assert(b == [ cast(ubyte) 0, 2, 1, 6, 5], text(b));
} }
} }
/+ /+
@ -4377,12 +4365,13 @@ assert(h.length == 0);
*/ */
struct BinaryHeap(Range, alias less = "a < b") if (isRandomAccessRange!(Range)) struct BinaryHeap(Range, alias less = "a < b") if (isRandomAccessRange!(Range))
{ {
private size_t _length;
private Range _store; private Range _store;
private alias binaryFun!(less) comp; private alias binaryFun!(less) comp;
//@@@BUG //@@@BUG
//private static void heapify(Range r, size_t i) //private static void heapify(Range r, size_t i)
void heapify(Range r, size_t i) public void heapify(Range r, size_t i = 0)
{ {
auto b = 0; auto b = 0;
for (;;) for (;;)
@ -4390,7 +4379,7 @@ struct BinaryHeap(Range, alias less = "a < b") if (isRandomAccessRange!(Range))
auto left = b + (i - b) * 2 + 1, right = left + 1; auto left = b + (i - b) * 2 + 1, right = left + 1;
if (right == r.length) if (right == r.length)
{ {
if (binaryFun!(less)(r[i], r[left])) swap(r[i], r[left]); if (comp(r[i], r[left])) swap(r[i], r[left]);
return; return;
} }
if (right > r.length) return; if (right > r.length) return;
@ -4404,24 +4393,49 @@ struct BinaryHeap(Range, alias less = "a < b") if (isRandomAccessRange!(Range))
} }
} }
private void pop(Range store) /*static*/ void pop(Range store)
{ {
if (store.length <= 1) return; assert(!store.empty);
auto newEnd = store.length - 1; if (store.length == 1) return;
swap(store.front, store[newEnd]); swap(store.front, store.back);
heapify(store[0 .. newEnd], 0); heapify(store[0 .. store.length - 1]);
}
private void assertValid()
{
debug
{
if (_length < 2) return;
for (size_t n = _length - 1; n >= 1; --n)
{
auto parentIdx = (n - 1) / 2;
assert(!comp(_store[parentIdx], _store[n]), text(n));
}
}
} }
public: public:
/** /**
Converts the range $(D r) into a heap. Performs $(BIGOH r.length) Converts the range $(D r) into a heap. If $(D initialSize) is
evaluations of $(D comp). specified, only the first $(D initialSize) elements in $(D r) are
*/ transformed into a heap, after which the heap can grow up to $(D
this(Range r) r.length). Performs $(BIGOH min(r.length, initialSize)) evaluations of
$(D less).
*/
this(Range r, size_t initialSize = size_t.max)
{ {
_store = r; acquire(r, initialSize);
if (_store.length < 2) return; }
auto i = (_store.length - 2) / 2;
/**
Takes ownership of a range.
*/
void acquire(Range r, size_t initialSize = size_t.max)
{
swap(r, _store);
_length = min(_store.length, initialSize);
if (_length < 2) return;
auto i = (_length - 2) / 2;
// @@@BUG: statement not reachable // @@@BUG: statement not reachable
// for (;;) // for (;;)
// { // {
@ -4429,38 +4443,89 @@ evaluations of $(D comp).
// if (i == 0) return; // if (i == 0) return;
// --i; // --i;
// } // }
this.heapify(_store, i); this.heapify(_store[0 .. _length], i);
for (; i-- != 0;) for (; i-- != 0;)
{ {
this.heapify(_store, i); this.heapify(_store[0 .. _length], i);
} }
assertValid;
} }
/** /**
Takes ownership of a range. The old content of the store is destroyed. Clears the heap. Returns the contents of the store (which satisfies
*/ the $(LUCKY heap property)).
void acquire(Range r)
{
swap(r, _store);
}
/**
Clears the heap. Returns the contents of the store.
*/ */
Range release() Range release()
{ {
Range result; Range result;
swap(result, _store); swap(result, _store);
result = result[0 .. _length];
_length = 0;
return result; return result;
} }
/**
Adds one element to the heap. If the length of the heap grows beyond
the length of the range that the heap was associated with, a new range
is allocated.
*/
void push(ElementType!Range value)
{
//assertValid;
if (_length == _store.length)
{
// reallocate
_store.length = (_length + 1) * 2;
}
// no reallocation
_store[_length] = value;
// sink down the element
for (size_t n = _length; n; )
{
auto parentIdx = (n - 1) / 2;
if (!comp(_store[parentIdx], _store[n])) break; // done!
// must swap and continue
swap(_store[parentIdx], _store[n]);
n = parentIdx;
}
++_length;
assertValid;
}
/**
If $(D this.length < this.capacity), call $(D
this.push(value)). Otherwise, if $(D less(value, this.top)), replace
$(D this.top) with $(D value) and adjust the heap to maintain the heap
property. This function is useful in scenarios where the largest $(D
N) of a large set of candidates must be remembered.
Returns: $(D true) if $(D value) was put in the heap, $(D false)
otherwise.
*/
bool conditionalPut(ElementType!Range value)
{
if (_length < _store.length)
{
push(value);
return true;
}
// must replace the top
assert(!_store.empty);
if (!comp(value, _store.front)) return false;
_store.front = value;
heapify(_store[0 .. _length]);
assertValid;
return true;
}
/** /**
Get the _top element (the largest according to the predicate $(D Get the _top element (the largest according to the predicate $(D
less)). less)).
*/ */
ref const(ElementType!(Range)) top() ElementType!Range top()
{ {
return _store[0]; assert(_length);
return _store.front;
} }
/** /**
@ -4468,10 +4533,10 @@ Pops the largest element (according to the predicate $(D less)).
*/ */
void pop() void pop()
{ {
enforce(_store.length); enforce(_length);
if (_store.length > 1) swap(_store.front, _store.back); if (_length > 1) swap(_store.front, _store[_length - 1]);
_store.popBack; --_length;
heapify(_store, 0); heapify(_store[0 .. _length]);
} }
/** /**
@ -4484,33 +4549,31 @@ heap is sorted and returned.
*/ */
Range pop(size_t n) Range pop(size_t n)
{ {
immutable size_t newSize = n >= _store.length immutable size_t newSize = n >= _length
? (n = _store.length, 0) ? (n = _length, 0)
: _store.length - n; : _length - n;
auto result = _store[newSize .. _store.length]; auto result = _store[newSize .. _length];
while (_store.length > newSize) while (_length > newSize)
{ {
swap(_store.front, _store.back); swap(_store.front, _store[_length - 1]);
_store.popBack; --_length;
heapify(_store, 0); heapify(_store[0 .. _length]);
} }
return result; return result;
} }
/** /**
Replaces the top element (largest according to $(D less)) with $(D Returns the _length of the heap.
newTop). */
*/ size_t length() const
void replaceTop(ElementType!(Range) newTop)
{ {
_store.front = newTop; return _length;
heapify!(less)(_store, 0);
} }
/** /**
Returns the _length of the heap. Returns the length of the range underlying the heap.
*/ */
size_t length() size_t capacity()
{ {
return _store.length; return _store.length;
} }
@ -4521,28 +4584,30 @@ unittest
{ {
// example from "Introduction to Algorithms" Cormen et al., p 146 // example from "Introduction to Algorithms" Cormen et al., p 146
int[] a = [ 4, 1, 3, 2, 16, 9, 10, 14, 8, 7 ]; int[] a = [ 4, 1, 3, 2, 16, 9, 10, 14, 8, 7 ];
//vnBinaryHeap!(int[]) h = BinaryHeap!(int[])(a); BinaryHeap!(int[]) h = BinaryHeap!(int[])(a);
//makeBinaryHeap!(binaryFun!("a < b"))(a); //makeBinaryHeap!(binaryFun!("a < b"))(a);
// assert(a == [ 16, 14, 10, 8, 7, 9, 3, 2, 4, 1 ]); assert(a == [ 16, 14, 10, 8, 7, 9, 3, 2, 4, 1 ]);
}
{
int[] a = [ 4, 1, 3, 2, 16, 9, 10, 14, 8, 7 ];
int[] b = new int[a.length];
BinaryHeap!(int[]) h = BinaryHeap!(int[])(b, 0);
foreach (e; a)
{
h.push(e);
}
assert(b == [ 16, 14, 10, 8, 7, 3, 9, 1, 4, 2 ], text(b));
}
{
int[] a = [ 4, 1, 3, 2, 16, 9, 10, 14, 8, 7 ];
BinaryHeap!(int[]) h = BinaryHeap!(int[])(a);
//makeBinaryHeap!(binaryFun!("a < b"))(a);
auto b = h.pop(5);
assert(b == [ 8, 9, 10, 14, 16 ]);
b = h.pop(5);
assert(b == [ 1, 2, 3, 4, 7 ]);
assert(h.length == 0);
} }
// {
// int[] a = [ 4, 1, 3, 2, 16, 9, 10, 14, 8, 7 ];
// BinaryHeap!(int[]) h = BinaryHeap!(int[])(a);
// //makeBinaryHeap!(binaryFun!("a < b"))(a);
// auto b = h.pop(5);
// assert(b == [ 8, 9, 10, 14, 16 ]);
// b = h.pop(5);
// assert(b == [ 1, 2, 3, 4, 7 ]);
// assert(h.length == 0);
// }
}
version(none) unittest
{
// example from "Introduction to Algorithms" Cormen et al., p 143
int[] a = [ 16, 4, 10, 14, 7, 9, 3, 2, 8, 1 ];
BinaryHeap!(int[]) h = BinaryHeap!(int[])(a);
assert(a == [ 16, 14, 10, 8, 7, 9, 3, 2, 4, 1 ]);
} }
/** /**
@ -4565,30 +4630,12 @@ TRange topNCopy(alias less = "a < b", SRange, TRange)
if (isInputRange!(SRange) && isRandomAccessRange!(TRange) if (isInputRange!(SRange) && isRandomAccessRange!(TRange)
&& hasLength!(TRange) && hasSlicing!(TRange)) && hasLength!(TRange) && hasSlicing!(TRange))
{ {
// make an initial heap in the target auto heap = BinaryHeap!(TRange, less)(target, 0);
foreach (i; 0 .. target.length) foreach (e; source) heap.conditionalPut(e);
{
if (source.empty)
{
target = target[0 .. i];
break;
}
target[i] = source.front;
source.popFront;
}
if (target.empty) return target;
auto heap = BinaryHeap!(TRange, less)(target);
// now copy stuff into the target if it's smaller
for (; !source.empty; source.popFront)
{
if (!binaryFun!(less)(source.front, heap.top)) continue;
heap.replaceTop(source.front);
}
return sorted ? heap.pop(heap.length) : heap.release; return sorted ? heap.pop(heap.length) : heap.release;
} }
version(none) unittest unittest
{ {
int[] a = [ 10, 16, 2, 3, 1, 5, 0 ]; int[] a = [ 10, 16, 2, 3, 1, 5, 0 ];
int[] b = new int[3]; int[] b = new int[3];
@ -4596,7 +4643,7 @@ version(none) unittest
assert(b == [ 0, 1, 2 ]); assert(b == [ 0, 1, 2 ]);
} }
version(none) unittest unittest
{ {
auto r = Random(unpredictableSeed); auto r = Random(unpredictableSeed);
int[] a = new int[uniform(1, 1000, r)]; int[] a = new int[uniform(1, 1000, r)];
@ -5094,6 +5141,115 @@ version(unittest)
} }
} }
// largestPartialIntersection
/**
Given a range of sorted forward ranges $(D ror), copies to $(D tgt)
the elements that are common to most ranges, along with their number
of occurrences. All ranges in $(D ror) are assumed to be sorted by $(D
less). Only the most frequent $(D tgt.length) elements are returned.
Example:
----
// Figure which number can be found in most arrays of the set of
// arrays below.
double[][] a =
[
[ 1, 4, 7, 8 ],
[ 1, 7 ],
[ 1, 7, 8],
[ 4 ],
[ 7 ],
];
auto b = new Tuple!(double, uint)[1];
largestPartialIntersection(a, b);
// First member is the item, second is the occurrence count
assert(b == tuple(7.0, 4u));
----
$(D 7.0) is the correct answer because it occurs in $(D 4) out of the
$(D 5) inputs, more than any other number. The second member of the
resulting tuple is indeed $(D 4) (recording the number of occurrences
of $(D 7.0)). If more of the top-frequent numbers are needed, just
create a larger $(D tgt) range. In the axample above, creating $(D b)
with length $(D 2) yields $(D tuple(1.0, 3u)) in the second position.
The function $(D largestPartialIntersection) is useful for
e.g. searching an $(LUCKY inverted index) for the documents most
likely to contain some terms of interest. The complexity of the search
is $(BIGOH n * log(tgt.length)), where $(D n) is the sum of lengths of
all input ranges. This approach is faster than keeping an associative
array of the occurrences and then selecting its top items, and also
requires less memory ($(D largestPartialIntersection) builds its
result directly in $(D tgt) and requires no extra memory).
*/
void largestPartialIntersection
(alias less = "a < b", RangeOfRanges, Range)
(RangeOfRanges ror, Range tgt, bool sorted = false)
{
alias binaryFun!less comp;
alias ElementType!(ElementType!RangeOfRanges) E;
alias ElementType!Range InfoType;
bool heapComp(InfoType a, InfoType b)
{
return a.field[1] > b.field[1];
}
auto heap = BinaryHeap!(Range, heapComp)(tgt, 0);
for (;;)
{
auto tr = frontTransversal(ror);
if (tr.empty) break;
auto mc = minCount!less(tr);
//auto cm = tuple(mc.field[1], mc.field[0]);
// auto cnt = mc.field[1];
// writeln(min);
// writeln(cnt);
// Put that on the heap
heap.conditionalPut(mc);
// Now remove that minimum element from wherever it occurred
foreach (ref r; ror)
{
if (r.empty) continue;
if (!comp(r.front, mc.field[0])
&& !comp(mc.field[0], r.front))
r.popFront;
}
}
if (sorted) heap.pop(heap.length);
}
unittest
{
double[][] a =
[
[ 1, 4, 7, 8 ],
[ 1, 7 ],
[ 1, 7, 8],
[ 4 ],
[ 7 ],
];
auto b = new Tuple!(double, uint)[2];
largestPartialIntersection(a, b, true);
//sort(b);
//writeln(b);
assert(b[0] == [ tuple(7., 4u), tuple(1., 3u) ][], text(b));
}
unittest
{
string[][] a =
[
[ "1", "4", "7", "8" ],
[ "1", "7" ],
[ "1", "7", "8"],
[ "4" ],
[ "7" ],
];
auto b = new Tuple!(string, uint)[2];
largestPartialIntersection(a, b, true);
//writeln(b);
assert(b == [ tuple("7", 4u), tuple("1", 3u) ][], text(b));
}
/* /*
* Copyright (C) 2004-2006 by Digital Mars, www.digitalmars.com * Copyright (C) 2004-2006 by Digital Mars, www.digitalmars.com
* Written by Andrei Alexandrescu, www.erdani.org * Written by Andrei Alexandrescu, www.erdani.org