631 lines
15 KiB
C
631 lines
15 KiB
C
/*
|
|
* contrib/intarray/_int_gist.c
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include <limits.h>
|
|
|
|
#include "_int.h"
|
|
#include "access/gist.h"
|
|
#include "access/reloptions.h"
|
|
#include "access/stratnum.h"
|
|
|
|
#define GETENTRY(vec,pos) ((ArrayType *) DatumGetPointer((vec)->vector[(pos)].key))
|
|
|
|
/*
|
|
* Control the maximum sparseness of compressed keys.
|
|
*
|
|
* The upper safe bound for this limit is half the maximum allocatable array
|
|
* size. A lower bound would give more guarantees that pathological data
|
|
* wouldn't eat excessive CPU and memory, but at the expense of breaking
|
|
* possibly working (after a fashion) indexes.
|
|
*/
|
|
#define MAXNUMELTS (Min((MaxAllocSize / sizeof(Datum)),((MaxAllocSize - ARR_OVERHEAD_NONULLS(1)) / sizeof(int)))/2)
|
|
/* or: #define MAXNUMELTS 1000000 */
|
|
|
|
/*
|
|
** GiST support methods
|
|
*/
|
|
PG_FUNCTION_INFO_V1(g_int_consistent);
|
|
PG_FUNCTION_INFO_V1(g_int_compress);
|
|
PG_FUNCTION_INFO_V1(g_int_decompress);
|
|
PG_FUNCTION_INFO_V1(g_int_penalty);
|
|
PG_FUNCTION_INFO_V1(g_int_picksplit);
|
|
PG_FUNCTION_INFO_V1(g_int_union);
|
|
PG_FUNCTION_INFO_V1(g_int_same);
|
|
PG_FUNCTION_INFO_V1(g_int_options);
|
|
|
|
|
|
/*
|
|
** The GiST Consistent method for _intments
|
|
** Should return false if for all data items x below entry,
|
|
** the predicate x op query == false, where op is the oper
|
|
** corresponding to strategy in the pg_amop table.
|
|
*/
|
|
Datum
|
|
g_int_consistent(PG_FUNCTION_ARGS)
|
|
{
|
|
GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
|
|
ArrayType *query = PG_GETARG_ARRAYTYPE_P_COPY(1);
|
|
StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
|
|
|
|
/* Oid subtype = PG_GETARG_OID(3); */
|
|
bool *recheck = (bool *) PG_GETARG_POINTER(4);
|
|
bool retval;
|
|
|
|
/* this is exact except for RTSameStrategyNumber */
|
|
*recheck = (strategy == RTSameStrategyNumber);
|
|
|
|
if (strategy == BooleanSearchStrategy)
|
|
{
|
|
retval = execconsistent((QUERYTYPE *) query,
|
|
(ArrayType *) DatumGetPointer(entry->key),
|
|
GIST_LEAF(entry));
|
|
|
|
pfree(query);
|
|
PG_RETURN_BOOL(retval);
|
|
}
|
|
|
|
/* sort query for fast search, key is already sorted */
|
|
CHECKARRVALID(query);
|
|
PREPAREARR(query);
|
|
|
|
switch (strategy)
|
|
{
|
|
case RTOverlapStrategyNumber:
|
|
retval = inner_int_overlap((ArrayType *) DatumGetPointer(entry->key),
|
|
query);
|
|
break;
|
|
case RTSameStrategyNumber:
|
|
if (GIST_LEAF(entry))
|
|
DirectFunctionCall3(g_int_same,
|
|
entry->key,
|
|
PointerGetDatum(query),
|
|
PointerGetDatum(&retval));
|
|
else
|
|
retval = inner_int_contains((ArrayType *) DatumGetPointer(entry->key),
|
|
query);
|
|
break;
|
|
case RTContainsStrategyNumber:
|
|
case RTOldContainsStrategyNumber:
|
|
retval = inner_int_contains((ArrayType *) DatumGetPointer(entry->key),
|
|
query);
|
|
break;
|
|
case RTContainedByStrategyNumber:
|
|
case RTOldContainedByStrategyNumber:
|
|
|
|
/*
|
|
* This code is unreachable as of intarray 1.4, because the <@
|
|
* operator has been removed from the opclass. We keep it for now
|
|
* to support older versions of the SQL definitions.
|
|
*/
|
|
if (GIST_LEAF(entry))
|
|
retval = inner_int_contains(query,
|
|
(ArrayType *) DatumGetPointer(entry->key));
|
|
else
|
|
{
|
|
/*
|
|
* Unfortunately, because empty arrays could be anywhere in
|
|
* the index, we must search the whole tree.
|
|
*/
|
|
retval = true;
|
|
}
|
|
break;
|
|
default:
|
|
retval = false;
|
|
}
|
|
pfree(query);
|
|
PG_RETURN_BOOL(retval);
|
|
}
|
|
|
|
Datum
|
|
g_int_union(PG_FUNCTION_ARGS)
|
|
{
|
|
GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
|
|
int *size = (int *) PG_GETARG_POINTER(1);
|
|
int32 i,
|
|
*ptr;
|
|
ArrayType *res;
|
|
int totlen = 0;
|
|
|
|
for (i = 0; i < entryvec->n; i++)
|
|
{
|
|
ArrayType *ent = GETENTRY(entryvec, i);
|
|
|
|
CHECKARRVALID(ent);
|
|
totlen += ARRNELEMS(ent);
|
|
}
|
|
|
|
res = new_intArrayType(totlen);
|
|
ptr = ARRPTR(res);
|
|
|
|
for (i = 0; i < entryvec->n; i++)
|
|
{
|
|
ArrayType *ent = GETENTRY(entryvec, i);
|
|
int nel;
|
|
|
|
nel = ARRNELEMS(ent);
|
|
memcpy(ptr, ARRPTR(ent), nel * sizeof(int32));
|
|
ptr += nel;
|
|
}
|
|
|
|
QSORT(res, 1);
|
|
res = _int_unique(res);
|
|
*size = VARSIZE(res);
|
|
PG_RETURN_POINTER(res);
|
|
}
|
|
|
|
/*
|
|
** GiST Compress and Decompress methods
|
|
*/
|
|
Datum
|
|
g_int_compress(PG_FUNCTION_ARGS)
|
|
{
|
|
GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
|
|
GISTENTRY *retval;
|
|
ArrayType *r;
|
|
int num_ranges = G_INT_GET_NUMRANGES();
|
|
int len,
|
|
lenr;
|
|
int *dr;
|
|
int i,
|
|
j,
|
|
cand;
|
|
int64 min;
|
|
|
|
if (entry->leafkey)
|
|
{
|
|
r = DatumGetArrayTypePCopy(entry->key);
|
|
CHECKARRVALID(r);
|
|
PREPAREARR(r);
|
|
|
|
if (ARRNELEMS(r) >= 2 * num_ranges)
|
|
elog(NOTICE, "input array is too big (%d maximum allowed, %d current), use gist__intbig_ops opclass instead",
|
|
2 * num_ranges - 1, ARRNELEMS(r));
|
|
|
|
retval = palloc(sizeof(GISTENTRY));
|
|
gistentryinit(*retval, PointerGetDatum(r),
|
|
entry->rel, entry->page, entry->offset, false);
|
|
|
|
PG_RETURN_POINTER(retval);
|
|
}
|
|
|
|
/*
|
|
* leaf entries never compress one more time, only when entry->leafkey
|
|
* ==true, so now we work only with internal keys
|
|
*/
|
|
|
|
r = DatumGetArrayTypeP(entry->key);
|
|
CHECKARRVALID(r);
|
|
if (ARRISEMPTY(r))
|
|
{
|
|
if (r != (ArrayType *) DatumGetPointer(entry->key))
|
|
pfree(r);
|
|
PG_RETURN_POINTER(entry);
|
|
}
|
|
|
|
if ((len = ARRNELEMS(r)) >= 2 * num_ranges)
|
|
{ /* compress */
|
|
if (r == (ArrayType *) DatumGetPointer(entry->key))
|
|
r = DatumGetArrayTypePCopy(entry->key);
|
|
r = resize_intArrayType(r, 2 * (len));
|
|
|
|
dr = ARRPTR(r);
|
|
|
|
/*
|
|
* "len" at this point is the number of ranges we will construct.
|
|
* "lenr" is the number of ranges we must eventually remove by
|
|
* merging, we must be careful to remove no more than this number.
|
|
*/
|
|
lenr = len - num_ranges;
|
|
|
|
/*
|
|
* Initially assume we can merge consecutive ints into a range. but we
|
|
* must count every value removed and stop when lenr runs out
|
|
*/
|
|
for (j = i = len - 1; i > 0 && lenr > 0; i--, j--)
|
|
{
|
|
int r_end = dr[i];
|
|
int r_start = r_end;
|
|
|
|
while (i > 0 && lenr > 0 && dr[i - 1] == r_start - 1)
|
|
--r_start, --i, --lenr;
|
|
dr[2 * j] = r_start;
|
|
dr[2 * j + 1] = r_end;
|
|
}
|
|
/* just copy the rest, if any, as trivial ranges */
|
|
for (; i >= 0; i--, j--)
|
|
dr[2 * j] = dr[2 * j + 1] = dr[i];
|
|
|
|
if (++j)
|
|
{
|
|
/*
|
|
* shunt everything down to start at the right place
|
|
*/
|
|
memmove((void *) &dr[0], (void *) &dr[2 * j], 2 * (len - j) * sizeof(int32));
|
|
}
|
|
|
|
/*
|
|
* make "len" be number of array elements, not ranges
|
|
*/
|
|
len = 2 * (len - j);
|
|
cand = 1;
|
|
while (len > num_ranges * 2)
|
|
{
|
|
min = PG_INT64_MAX;
|
|
for (i = 2; i < len; i += 2)
|
|
if (min > ((int64) dr[i] - (int64) dr[i - 1]))
|
|
{
|
|
min = ((int64) dr[i] - (int64) dr[i - 1]);
|
|
cand = i;
|
|
}
|
|
memmove((void *) &dr[cand - 1], (void *) &dr[cand + 1], (len - cand - 1) * sizeof(int32));
|
|
len -= 2;
|
|
}
|
|
|
|
/*
|
|
* check sparseness of result
|
|
*/
|
|
lenr = internal_size(dr, len);
|
|
if (lenr < 0 || lenr > MAXNUMELTS)
|
|
ereport(ERROR,
|
|
(errmsg("data is too sparse, recreate index using gist__intbig_ops opclass instead")));
|
|
|
|
r = resize_intArrayType(r, len);
|
|
retval = palloc(sizeof(GISTENTRY));
|
|
gistentryinit(*retval, PointerGetDatum(r),
|
|
entry->rel, entry->page, entry->offset, false);
|
|
PG_RETURN_POINTER(retval);
|
|
}
|
|
else
|
|
PG_RETURN_POINTER(entry);
|
|
}
|
|
|
|
Datum
|
|
g_int_decompress(PG_FUNCTION_ARGS)
|
|
{
|
|
GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
|
|
GISTENTRY *retval;
|
|
ArrayType *r;
|
|
int num_ranges = G_INT_GET_NUMRANGES();
|
|
int *dr,
|
|
lenr;
|
|
ArrayType *in;
|
|
int lenin;
|
|
int *din;
|
|
int i,
|
|
j;
|
|
|
|
in = DatumGetArrayTypeP(entry->key);
|
|
|
|
CHECKARRVALID(in);
|
|
if (ARRISEMPTY(in))
|
|
{
|
|
if (in != (ArrayType *) DatumGetPointer(entry->key))
|
|
{
|
|
retval = palloc(sizeof(GISTENTRY));
|
|
gistentryinit(*retval, PointerGetDatum(in),
|
|
entry->rel, entry->page, entry->offset, false);
|
|
PG_RETURN_POINTER(retval);
|
|
}
|
|
|
|
PG_RETURN_POINTER(entry);
|
|
}
|
|
|
|
lenin = ARRNELEMS(in);
|
|
|
|
if (lenin < 2 * num_ranges)
|
|
{ /* not compressed value */
|
|
if (in != (ArrayType *) DatumGetPointer(entry->key))
|
|
{
|
|
retval = palloc(sizeof(GISTENTRY));
|
|
gistentryinit(*retval, PointerGetDatum(in),
|
|
entry->rel, entry->page, entry->offset, false);
|
|
|
|
PG_RETURN_POINTER(retval);
|
|
}
|
|
PG_RETURN_POINTER(entry);
|
|
}
|
|
|
|
din = ARRPTR(in);
|
|
lenr = internal_size(din, lenin);
|
|
if (lenr < 0 || lenr > MAXNUMELTS)
|
|
ereport(ERROR,
|
|
(errmsg("compressed array is too big, recreate index using gist__intbig_ops opclass instead")));
|
|
|
|
r = new_intArrayType(lenr);
|
|
dr = ARRPTR(r);
|
|
|
|
for (i = 0; i < lenin; i += 2)
|
|
for (j = din[i]; j <= din[i + 1]; j++)
|
|
if ((!i) || *(dr - 1) != j)
|
|
*dr++ = j;
|
|
|
|
if (in != (ArrayType *) DatumGetPointer(entry->key))
|
|
pfree(in);
|
|
retval = palloc(sizeof(GISTENTRY));
|
|
gistentryinit(*retval, PointerGetDatum(r),
|
|
entry->rel, entry->page, entry->offset, false);
|
|
|
|
PG_RETURN_POINTER(retval);
|
|
}
|
|
|
|
/*
|
|
** The GiST Penalty method for _intments
|
|
*/
|
|
Datum
|
|
g_int_penalty(PG_FUNCTION_ARGS)
|
|
{
|
|
GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0);
|
|
GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1);
|
|
float *result = (float *) PG_GETARG_POINTER(2);
|
|
ArrayType *ud;
|
|
float tmp1,
|
|
tmp2;
|
|
|
|
ud = inner_int_union((ArrayType *) DatumGetPointer(origentry->key),
|
|
(ArrayType *) DatumGetPointer(newentry->key));
|
|
rt__int_size(ud, &tmp1);
|
|
rt__int_size((ArrayType *) DatumGetPointer(origentry->key), &tmp2);
|
|
*result = tmp1 - tmp2;
|
|
pfree(ud);
|
|
|
|
PG_RETURN_POINTER(result);
|
|
}
|
|
|
|
|
|
|
|
Datum
|
|
g_int_same(PG_FUNCTION_ARGS)
|
|
{
|
|
ArrayType *a = PG_GETARG_ARRAYTYPE_P(0);
|
|
ArrayType *b = PG_GETARG_ARRAYTYPE_P(1);
|
|
bool *result = (bool *) PG_GETARG_POINTER(2);
|
|
int32 n = ARRNELEMS(a);
|
|
int32 *da,
|
|
*db;
|
|
|
|
CHECKARRVALID(a);
|
|
CHECKARRVALID(b);
|
|
|
|
if (n != ARRNELEMS(b))
|
|
{
|
|
*result = false;
|
|
PG_RETURN_POINTER(result);
|
|
}
|
|
*result = true;
|
|
da = ARRPTR(a);
|
|
db = ARRPTR(b);
|
|
while (n--)
|
|
{
|
|
if (*da++ != *db++)
|
|
{
|
|
*result = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
PG_RETURN_POINTER(result);
|
|
}
|
|
|
|
/*****************************************************************
|
|
** Common GiST Method
|
|
*****************************************************************/
|
|
|
|
typedef struct
|
|
{
|
|
OffsetNumber pos;
|
|
float cost;
|
|
} SPLITCOST;
|
|
|
|
static int
|
|
comparecost(const void *a, const void *b)
|
|
{
|
|
if (((const SPLITCOST *) a)->cost == ((const SPLITCOST *) b)->cost)
|
|
return 0;
|
|
else
|
|
return (((const SPLITCOST *) a)->cost > ((const SPLITCOST *) b)->cost) ? 1 : -1;
|
|
}
|
|
|
|
/*
|
|
** The GiST PickSplit method for _intments
|
|
** We use Guttman's poly time split algorithm
|
|
*/
|
|
Datum
|
|
g_int_picksplit(PG_FUNCTION_ARGS)
|
|
{
|
|
GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0);
|
|
GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1);
|
|
OffsetNumber i,
|
|
j;
|
|
ArrayType *datum_alpha,
|
|
*datum_beta;
|
|
ArrayType *datum_l,
|
|
*datum_r;
|
|
ArrayType *union_d,
|
|
*union_dl,
|
|
*union_dr;
|
|
ArrayType *inter_d;
|
|
bool firsttime;
|
|
float size_alpha,
|
|
size_beta,
|
|
size_union,
|
|
size_inter;
|
|
float size_waste,
|
|
waste;
|
|
float size_l,
|
|
size_r;
|
|
int nbytes;
|
|
OffsetNumber seed_1 = 0,
|
|
seed_2 = 0;
|
|
OffsetNumber *left,
|
|
*right;
|
|
OffsetNumber maxoff;
|
|
SPLITCOST *costvector;
|
|
|
|
#ifdef GIST_DEBUG
|
|
elog(DEBUG3, "--------picksplit %d", entryvec->n);
|
|
#endif
|
|
|
|
maxoff = entryvec->n - 2;
|
|
nbytes = (maxoff + 2) * sizeof(OffsetNumber);
|
|
v->spl_left = (OffsetNumber *) palloc(nbytes);
|
|
v->spl_right = (OffsetNumber *) palloc(nbytes);
|
|
|
|
firsttime = true;
|
|
waste = 0.0;
|
|
for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
|
|
{
|
|
datum_alpha = GETENTRY(entryvec, i);
|
|
for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j))
|
|
{
|
|
datum_beta = GETENTRY(entryvec, j);
|
|
|
|
/* compute the wasted space by unioning these guys */
|
|
/* size_waste = size_union - size_inter; */
|
|
union_d = inner_int_union(datum_alpha, datum_beta);
|
|
rt__int_size(union_d, &size_union);
|
|
inter_d = inner_int_inter(datum_alpha, datum_beta);
|
|
rt__int_size(inter_d, &size_inter);
|
|
size_waste = size_union - size_inter;
|
|
|
|
pfree(union_d);
|
|
pfree(inter_d);
|
|
|
|
/*
|
|
* are these a more promising split that what we've already seen?
|
|
*/
|
|
|
|
if (size_waste > waste || firsttime)
|
|
{
|
|
waste = size_waste;
|
|
seed_1 = i;
|
|
seed_2 = j;
|
|
firsttime = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
left = v->spl_left;
|
|
v->spl_nleft = 0;
|
|
right = v->spl_right;
|
|
v->spl_nright = 0;
|
|
if (seed_1 == 0 || seed_2 == 0)
|
|
{
|
|
seed_1 = 1;
|
|
seed_2 = 2;
|
|
}
|
|
|
|
datum_alpha = GETENTRY(entryvec, seed_1);
|
|
datum_l = copy_intArrayType(datum_alpha);
|
|
rt__int_size(datum_l, &size_l);
|
|
datum_beta = GETENTRY(entryvec, seed_2);
|
|
datum_r = copy_intArrayType(datum_beta);
|
|
rt__int_size(datum_r, &size_r);
|
|
|
|
maxoff = OffsetNumberNext(maxoff);
|
|
|
|
/*
|
|
* sort entries
|
|
*/
|
|
costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff);
|
|
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
|
|
{
|
|
costvector[i - 1].pos = i;
|
|
datum_alpha = GETENTRY(entryvec, i);
|
|
union_d = inner_int_union(datum_l, datum_alpha);
|
|
rt__int_size(union_d, &size_alpha);
|
|
pfree(union_d);
|
|
union_d = inner_int_union(datum_r, datum_alpha);
|
|
rt__int_size(union_d, &size_beta);
|
|
pfree(union_d);
|
|
costvector[i - 1].cost = Abs((size_alpha - size_l) - (size_beta - size_r));
|
|
}
|
|
qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost);
|
|
|
|
/*
|
|
* Now split up the regions between the two seeds. An important property
|
|
* of this split algorithm is that the split vector v has the indices of
|
|
* items to be split in order in its left and right vectors. We exploit
|
|
* this property by doing a merge in the code that actually splits the
|
|
* page.
|
|
*
|
|
* For efficiency, we also place the new index tuple in this loop. This is
|
|
* handled at the very end, when we have placed all the existing tuples
|
|
* and i == maxoff + 1.
|
|
*/
|
|
|
|
|
|
for (j = 0; j < maxoff; j++)
|
|
{
|
|
i = costvector[j].pos;
|
|
|
|
/*
|
|
* If we've already decided where to place this item, just put it on
|
|
* the right list. Otherwise, we need to figure out which page needs
|
|
* the least enlargement in order to store the item.
|
|
*/
|
|
|
|
if (i == seed_1)
|
|
{
|
|
*left++ = i;
|
|
v->spl_nleft++;
|
|
continue;
|
|
}
|
|
else if (i == seed_2)
|
|
{
|
|
*right++ = i;
|
|
v->spl_nright++;
|
|
continue;
|
|
}
|
|
|
|
/* okay, which page needs least enlargement? */
|
|
datum_alpha = GETENTRY(entryvec, i);
|
|
union_dl = inner_int_union(datum_l, datum_alpha);
|
|
union_dr = inner_int_union(datum_r, datum_alpha);
|
|
rt__int_size(union_dl, &size_alpha);
|
|
rt__int_size(union_dr, &size_beta);
|
|
|
|
/* pick which page to add it to */
|
|
if (size_alpha - size_l < size_beta - size_r + WISH_F(v->spl_nleft, v->spl_nright, 0.01))
|
|
{
|
|
pfree(datum_l);
|
|
pfree(union_dr);
|
|
datum_l = union_dl;
|
|
size_l = size_alpha;
|
|
*left++ = i;
|
|
v->spl_nleft++;
|
|
}
|
|
else
|
|
{
|
|
pfree(datum_r);
|
|
pfree(union_dl);
|
|
datum_r = union_dr;
|
|
size_r = size_beta;
|
|
*right++ = i;
|
|
v->spl_nright++;
|
|
}
|
|
}
|
|
pfree(costvector);
|
|
*right = *left = FirstOffsetNumber;
|
|
|
|
v->spl_ldatum = PointerGetDatum(datum_l);
|
|
v->spl_rdatum = PointerGetDatum(datum_r);
|
|
|
|
PG_RETURN_POINTER(v);
|
|
}
|
|
|
|
Datum
|
|
g_int_options(PG_FUNCTION_ARGS)
|
|
{
|
|
local_relopts *relopts = (local_relopts *) PG_GETARG_POINTER(0);
|
|
|
|
init_local_reloptions(relopts, sizeof(GISTIntArrayOptions));
|
|
add_local_int_reloption(relopts, "numranges",
|
|
"number of ranges for compression",
|
|
G_INT_NUMRANGES_DEFAULT, 1, G_INT_NUMRANGES_MAX,
|
|
offsetof(GISTIntArrayOptions, num_ranges));
|
|
|
|
PG_RETURN_VOID();
|
|
}
|