418 lines
11 KiB
C
418 lines
11 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* heap_surgery.c
|
|
* Functions to perform surgery on the damaged heap table.
|
|
*
|
|
* Copyright (c) 2020-2022, PostgreSQL Global Development Group
|
|
*
|
|
* IDENTIFICATION
|
|
* contrib/pg_surgery/heap_surgery.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "access/heapam.h"
|
|
#include "access/visibilitymap.h"
|
|
#include "access/xloginsert.h"
|
|
#include "catalog/pg_am_d.h"
|
|
#include "catalog/pg_proc_d.h"
|
|
#include "miscadmin.h"
|
|
#include "storage/bufmgr.h"
|
|
#include "utils/acl.h"
|
|
#include "utils/rel.h"
|
|
|
|
PG_MODULE_MAGIC;
|
|
|
|
/* Options to forcefully change the state of a heap tuple. */
|
|
typedef enum HeapTupleForceOption
|
|
{
|
|
HEAP_FORCE_KILL,
|
|
HEAP_FORCE_FREEZE
|
|
} HeapTupleForceOption;
|
|
|
|
PG_FUNCTION_INFO_V1(heap_force_kill);
|
|
PG_FUNCTION_INFO_V1(heap_force_freeze);
|
|
|
|
static int32 tidcmp(const void *a, const void *b);
|
|
static Datum heap_force_common(FunctionCallInfo fcinfo,
|
|
HeapTupleForceOption heap_force_opt);
|
|
static void sanity_check_tid_array(ArrayType *ta, int *ntids);
|
|
static BlockNumber find_tids_one_page(ItemPointer tids, int ntids,
|
|
OffsetNumber *next_start_ptr);
|
|
|
|
/*-------------------------------------------------------------------------
|
|
* heap_force_kill()
|
|
*
|
|
* Force kill the tuple(s) pointed to by the item pointer(s) stored in the
|
|
* given TID array.
|
|
*
|
|
* Usage: SELECT heap_force_kill(regclass, tid[]);
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
Datum
|
|
heap_force_kill(PG_FUNCTION_ARGS)
|
|
{
|
|
PG_RETURN_DATUM(heap_force_common(fcinfo, HEAP_FORCE_KILL));
|
|
}
|
|
|
|
/*-------------------------------------------------------------------------
|
|
* heap_force_freeze()
|
|
*
|
|
* Force freeze the tuple(s) pointed to by the item pointer(s) stored in the
|
|
* given TID array.
|
|
*
|
|
* Usage: SELECT heap_force_freeze(regclass, tid[]);
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
Datum
|
|
heap_force_freeze(PG_FUNCTION_ARGS)
|
|
{
|
|
PG_RETURN_DATUM(heap_force_common(fcinfo, HEAP_FORCE_FREEZE));
|
|
}
|
|
|
|
/*-------------------------------------------------------------------------
|
|
* heap_force_common()
|
|
*
|
|
* Common code for heap_force_kill and heap_force_freeze
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
static Datum
|
|
heap_force_common(FunctionCallInfo fcinfo, HeapTupleForceOption heap_force_opt)
|
|
{
|
|
Oid relid = PG_GETARG_OID(0);
|
|
ArrayType *ta = PG_GETARG_ARRAYTYPE_P_COPY(1);
|
|
ItemPointer tids;
|
|
int ntids,
|
|
nblocks;
|
|
Relation rel;
|
|
OffsetNumber curr_start_ptr,
|
|
next_start_ptr;
|
|
bool include_this_tid[MaxHeapTuplesPerPage];
|
|
|
|
if (RecoveryInProgress())
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
|
|
errmsg("recovery is in progress"),
|
|
errhint("heap surgery functions cannot be executed during recovery.")));
|
|
|
|
/* Check inputs. */
|
|
sanity_check_tid_array(ta, &ntids);
|
|
|
|
rel = relation_open(relid, RowExclusiveLock);
|
|
|
|
/*
|
|
* Check target relation.
|
|
*/
|
|
if (!RELKIND_HAS_TABLE_AM(rel->rd_rel->relkind))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
|
|
errmsg("cannot operate on relation \"%s\"",
|
|
RelationGetRelationName(rel)),
|
|
errdetail_relkind_not_supported(rel->rd_rel->relkind)));
|
|
|
|
if (rel->rd_rel->relam != HEAP_TABLE_AM_OID)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("only heap AM is supported")));
|
|
|
|
/* Must be owner of the table or superuser. */
|
|
if (!pg_class_ownercheck(RelationGetRelid(rel), GetUserId()))
|
|
aclcheck_error(ACLCHECK_NOT_OWNER,
|
|
get_relkind_objtype(rel->rd_rel->relkind),
|
|
RelationGetRelationName(rel));
|
|
|
|
tids = ((ItemPointer) ARR_DATA_PTR(ta));
|
|
|
|
/*
|
|
* If there is more than one TID in the array, sort them so that we can
|
|
* easily fetch all the TIDs belonging to one particular page from the
|
|
* array.
|
|
*/
|
|
if (ntids > 1)
|
|
qsort((void *) tids, ntids, sizeof(ItemPointerData), tidcmp);
|
|
|
|
curr_start_ptr = next_start_ptr = 0;
|
|
nblocks = RelationGetNumberOfBlocks(rel);
|
|
|
|
/*
|
|
* Loop, performing the necessary actions for each block.
|
|
*/
|
|
while (next_start_ptr != ntids)
|
|
{
|
|
Buffer buf;
|
|
Buffer vmbuf = InvalidBuffer;
|
|
Page page;
|
|
BlockNumber blkno;
|
|
OffsetNumber curoff;
|
|
OffsetNumber maxoffset;
|
|
int i;
|
|
bool did_modify_page = false;
|
|
bool did_modify_vm = false;
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
/*
|
|
* Find all the TIDs belonging to one particular page starting from
|
|
* next_start_ptr and process them one by one.
|
|
*/
|
|
blkno = find_tids_one_page(tids, ntids, &next_start_ptr);
|
|
|
|
/* Check whether the block number is valid. */
|
|
if (blkno >= nblocks)
|
|
{
|
|
/* Update the current_start_ptr before moving to the next page. */
|
|
curr_start_ptr = next_start_ptr;
|
|
|
|
ereport(NOTICE,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("skipping block %u for relation \"%s\" because the block number is out of range",
|
|
blkno, RelationGetRelationName(rel))));
|
|
continue;
|
|
}
|
|
|
|
buf = ReadBuffer(rel, blkno);
|
|
LockBufferForCleanup(buf);
|
|
|
|
page = BufferGetPage(buf);
|
|
|
|
maxoffset = PageGetMaxOffsetNumber(page);
|
|
|
|
/*
|
|
* Figure out which TIDs we are going to process and which ones we are
|
|
* going to skip.
|
|
*/
|
|
memset(include_this_tid, 0, sizeof(include_this_tid));
|
|
for (i = curr_start_ptr; i < next_start_ptr; i++)
|
|
{
|
|
OffsetNumber offno = ItemPointerGetOffsetNumberNoCheck(&tids[i]);
|
|
ItemId itemid;
|
|
|
|
/* Check whether the offset number is valid. */
|
|
if (offno == InvalidOffsetNumber || offno > maxoffset)
|
|
{
|
|
ereport(NOTICE,
|
|
errmsg("skipping tid (%u, %u) for relation \"%s\" because the item number is out of range",
|
|
blkno, offno, RelationGetRelationName(rel)));
|
|
continue;
|
|
}
|
|
|
|
itemid = PageGetItemId(page, offno);
|
|
|
|
/* Only accept an item ID that is used. */
|
|
if (ItemIdIsRedirected(itemid))
|
|
{
|
|
ereport(NOTICE,
|
|
errmsg("skipping tid (%u, %u) for relation \"%s\" because it redirects to item %u",
|
|
blkno, offno, RelationGetRelationName(rel),
|
|
ItemIdGetRedirect(itemid)));
|
|
continue;
|
|
}
|
|
else if (ItemIdIsDead(itemid))
|
|
{
|
|
ereport(NOTICE,
|
|
(errmsg("skipping tid (%u, %u) for relation \"%s\" because it is marked dead",
|
|
blkno, offno, RelationGetRelationName(rel))));
|
|
continue;
|
|
}
|
|
else if (!ItemIdIsUsed(itemid))
|
|
{
|
|
ereport(NOTICE,
|
|
(errmsg("skipping tid (%u, %u) for relation \"%s\" because it is marked unused",
|
|
blkno, offno, RelationGetRelationName(rel))));
|
|
continue;
|
|
}
|
|
|
|
/* Mark it for processing. */
|
|
Assert(offno < MaxHeapTuplesPerPage);
|
|
include_this_tid[offno] = true;
|
|
}
|
|
|
|
/*
|
|
* Before entering the critical section, pin the visibility map page
|
|
* if it appears to be necessary.
|
|
*/
|
|
if (heap_force_opt == HEAP_FORCE_KILL && PageIsAllVisible(page))
|
|
visibilitymap_pin(rel, blkno, &vmbuf);
|
|
|
|
/* No ereport(ERROR) from here until all the changes are logged. */
|
|
START_CRIT_SECTION();
|
|
|
|
for (curoff = FirstOffsetNumber; curoff <= maxoffset;
|
|
curoff = OffsetNumberNext(curoff))
|
|
{
|
|
ItemId itemid;
|
|
|
|
if (!include_this_tid[curoff])
|
|
continue;
|
|
|
|
itemid = PageGetItemId(page, curoff);
|
|
Assert(ItemIdIsNormal(itemid));
|
|
|
|
did_modify_page = true;
|
|
|
|
if (heap_force_opt == HEAP_FORCE_KILL)
|
|
{
|
|
ItemIdSetDead(itemid);
|
|
|
|
/*
|
|
* If the page is marked all-visible, we must clear
|
|
* PD_ALL_VISIBLE flag on the page header and an all-visible
|
|
* bit on the visibility map corresponding to the page.
|
|
*/
|
|
if (PageIsAllVisible(page))
|
|
{
|
|
PageClearAllVisible(page);
|
|
visibilitymap_clear(rel, blkno, vmbuf,
|
|
VISIBILITYMAP_VALID_BITS);
|
|
did_modify_vm = true;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
HeapTupleHeader htup;
|
|
|
|
Assert(heap_force_opt == HEAP_FORCE_FREEZE);
|
|
|
|
htup = (HeapTupleHeader) PageGetItem(page, itemid);
|
|
|
|
/*
|
|
* Reset all visibility-related fields of the tuple. This
|
|
* logic should mimic heap_execute_freeze_tuple(), but we
|
|
* choose to reset xmin and ctid just to be sure that no
|
|
* potentially-garbled data is left behind.
|
|
*/
|
|
ItemPointerSet(&htup->t_ctid, blkno, curoff);
|
|
HeapTupleHeaderSetXmin(htup, FrozenTransactionId);
|
|
HeapTupleHeaderSetXmax(htup, InvalidTransactionId);
|
|
if (htup->t_infomask & HEAP_MOVED)
|
|
{
|
|
if (htup->t_infomask & HEAP_MOVED_OFF)
|
|
HeapTupleHeaderSetXvac(htup, InvalidTransactionId);
|
|
else
|
|
HeapTupleHeaderSetXvac(htup, FrozenTransactionId);
|
|
}
|
|
|
|
/*
|
|
* Clear all the visibility-related bits of this tuple and
|
|
* mark it as frozen. Also, get rid of HOT_UPDATED and
|
|
* KEYS_UPDATES bits.
|
|
*/
|
|
htup->t_infomask &= ~HEAP_XACT_MASK;
|
|
htup->t_infomask |= (HEAP_XMIN_FROZEN | HEAP_XMAX_INVALID);
|
|
htup->t_infomask2 &= ~HEAP_HOT_UPDATED;
|
|
htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If the page was modified, only then, we mark the buffer dirty or do
|
|
* the WAL logging.
|
|
*/
|
|
if (did_modify_page)
|
|
{
|
|
/* Mark buffer dirty before we write WAL. */
|
|
MarkBufferDirty(buf);
|
|
|
|
/* XLOG stuff */
|
|
if (RelationNeedsWAL(rel))
|
|
log_newpage_buffer(buf, true);
|
|
}
|
|
|
|
/* WAL log the VM page if it was modified. */
|
|
if (did_modify_vm && RelationNeedsWAL(rel))
|
|
log_newpage_buffer(vmbuf, false);
|
|
|
|
END_CRIT_SECTION();
|
|
|
|
UnlockReleaseBuffer(buf);
|
|
|
|
if (vmbuf != InvalidBuffer)
|
|
ReleaseBuffer(vmbuf);
|
|
|
|
/* Update the current_start_ptr before moving to the next page. */
|
|
curr_start_ptr = next_start_ptr;
|
|
}
|
|
|
|
relation_close(rel, RowExclusiveLock);
|
|
|
|
pfree(ta);
|
|
|
|
PG_RETURN_VOID();
|
|
}
|
|
|
|
/*-------------------------------------------------------------------------
|
|
* tidcmp()
|
|
*
|
|
* Compare two item pointers, return -1, 0, or +1.
|
|
*
|
|
* See ItemPointerCompare for details.
|
|
* ------------------------------------------------------------------------
|
|
*/
|
|
static int32
|
|
tidcmp(const void *a, const void *b)
|
|
{
|
|
ItemPointer iptr1 = ((const ItemPointer) a);
|
|
ItemPointer iptr2 = ((const ItemPointer) b);
|
|
|
|
return ItemPointerCompare(iptr1, iptr2);
|
|
}
|
|
|
|
/*-------------------------------------------------------------------------
|
|
* sanity_check_tid_array()
|
|
*
|
|
* Perform sanity checks on the given tid array, and set *ntids to the
|
|
* number of items in the array.
|
|
* ------------------------------------------------------------------------
|
|
*/
|
|
static void
|
|
sanity_check_tid_array(ArrayType *ta, int *ntids)
|
|
{
|
|
if (ARR_HASNULL(ta) && array_contains_nulls(ta))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
|
|
errmsg("array must not contain nulls")));
|
|
|
|
if (ARR_NDIM(ta) > 1)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DATA_EXCEPTION),
|
|
errmsg("argument must be empty or one-dimensional array")));
|
|
|
|
*ntids = ArrayGetNItems(ARR_NDIM(ta), ARR_DIMS(ta));
|
|
}
|
|
|
|
/*-------------------------------------------------------------------------
|
|
* find_tids_one_page()
|
|
*
|
|
* Find all the tids residing in the same page as tids[next_start_ptr], and
|
|
* update next_start_ptr so that it points to the first tid in the next page.
|
|
*
|
|
* NOTE: The input tids[] array must be sorted.
|
|
* ------------------------------------------------------------------------
|
|
*/
|
|
static BlockNumber
|
|
find_tids_one_page(ItemPointer tids, int ntids, OffsetNumber *next_start_ptr)
|
|
{
|
|
int i;
|
|
BlockNumber prev_blkno,
|
|
blkno;
|
|
|
|
prev_blkno = blkno = InvalidBlockNumber;
|
|
|
|
for (i = *next_start_ptr; i < ntids; i++)
|
|
{
|
|
ItemPointerData tid = tids[i];
|
|
|
|
blkno = ItemPointerGetBlockNumberNoCheck(&tid);
|
|
|
|
if (i == *next_start_ptr)
|
|
prev_blkno = blkno;
|
|
|
|
if (prev_blkno != blkno)
|
|
break;
|
|
}
|
|
|
|
*next_start_ptr = i;
|
|
return prev_blkno;
|
|
}
|