PostgreSQL源码解读(91)-查询语句#76(ExecHashJoin函数#2)
本节是ExecHashJoin函数介绍的第二部分,主要介绍了ExecHashJoin中依赖的其他函数的实现逻辑,包括ExecHashTableCreate、ExecChooseHashTableSize等。
成都创新互联坚持“要么做到,要么别承诺”的工作理念,服务领域包括:做网站、成都网站建设、企业官网、英文网站、手机端网站、网站推广等服务,满足客户于互联网时代的呈贡网站设计、移动媒体设计的需求,帮助企业找到有效的互联网解决方案。努力成为您成熟可靠的网络建设合作伙伴!
一、数据结构
Plan
所有计划节点通过将Plan结构作为第一个字段从Plan结构“派生”。这确保了在将节点转换为计划节点时,一切都能正常工作。(在执行器中以通用方式传递时,节点指针经常被转换为Plan *)
/* ----------------
* Plan node
*
* All plan nodes "derive" from the Plan structure by having the
* Plan structure as the first field. This ensures that everything works
* when nodes are cast to Plan's. (node pointers are frequently cast to Plan*
* when passed around generically in the executor)
* 所有计划节点通过将Plan结构作为第一个字段从Plan结构“派生”。
* 这确保了在将节点转换为计划节点时,一切都能正常工作。
* (在执行器中以通用方式传递时,节点指针经常被转换为Plan *)
*
* We never actually instantiate any Plan nodes; this is just the common
* abstract superclass for all Plan-type nodes.
* 从未实例化任何Plan节点;这只是所有Plan-type节点的通用抽象超类。
* ----------------
*/
typedef struct Plan
{
NodeTag type;//节点类型
/*
* 成本估算信息;estimated execution costs for plan (see costsize.c for more info)
*/
Cost startup_cost; /* 启动成本;cost expended before fetching any tuples */
Cost total_cost; /* 总成本;total cost (assuming all tuples fetched) */
/*
* 优化器估算信息;planner's estimate of result size of this plan step
*/
double plan_rows; /* 行数;number of rows plan is expected to emit */
int plan_width; /* 平均行大小(Byte为单位);average row width in bytes */
/*
* 并行执行相关的信息;information needed for parallel query
*/
bool parallel_aware; /* 是否参与并行执行逻辑?engage parallel-aware logic? */
bool parallel_safe; /* 是否并行安全;OK to use as part of parallel plan? */
/*
* Plan类型节点通用的信息.Common structural data for all Plan types.
*/
int plan_node_id; /* unique across entire final plan tree */
List *targetlist; /* target list to be computed at this node */
List *qual; /* implicitly-ANDed qual conditions */
struct Plan *lefttree; /* input plan tree(s) */
struct Plan *righttree;
List *initPlan; /* Init Plan nodes (un-correlated expr
* subselects) */
/*
* Information for management of parameter-change-driven rescanning
* parameter-change-driven重扫描的管理信息.
*
* extParam includes the paramIDs of all external PARAM_EXEC params
* affecting this plan node or its children. setParam params from the
* node's initPlans are not included, but their extParams are.
*
* allParam includes all the extParam paramIDs, plus the IDs of local
* params that affect the node (i.e., the setParams of its initplans).
* These are _all_ the PARAM_EXEC params that affect this node.
*/
Bitmapset *extParam;
Bitmapset *allParam;
} Plan;
JoinState
Hash/NestLoop/Merge Join的基类
/* ----------------
* JoinState information
*
* Superclass for state nodes of join plans.
* Hash/NestLoop/Merge Join的基类
* ----------------
*/
typedef struct JoinState
{
PlanState ps;//基类PlanState
JoinType jointype;//连接类型
//在找到一个匹配inner tuple的时候,如需要跳转到下一个outer tuple,则该值为T
bool single_match; /* True if we should skip to next outer tuple
* after finding one inner match */
//连接条件表达式(除了ps.qual)
ExprState *joinqual; /* JOIN quals (in addition to ps.qual) */
} JoinState;
HashJoinState
Hash Join运行期状态结构体
/* these structs are defined in executor/hashjoin.h: */
typedef struct HashJoinTupleData *HashJoinTuple;
typedef struct HashJoinTableData *HashJoinTable;
typedef struct HashJoinState
{
JoinState js; /* 基类;its first field is NodeTag */
ExprState *hashclauses;//hash连接条件
List *hj_OuterHashKeys; /* 外表条件链表;list of ExprState nodes */
List *hj_InnerHashKeys; /* 内表连接条件;list of ExprState nodes */
List *hj_HashOperators; /* 操作符OIDs链表;list of operator OIDs */
HashJoinTable hj_HashTable;//Hash表
uint32 hj_CurHashValue;//当前的Hash值
int hj_CurBucketNo;//当前的bucket编号
int hj_CurSkewBucketNo;//行倾斜bucket编号
HashJoinTuple hj_CurTuple;//当前元组
TupleTableSlot *hj_OuterTupleSlot;//outer relation slot
TupleTableSlot *hj_HashTupleSlot;//Hash tuple slot
TupleTableSlot *hj_NullOuterTupleSlot;//用于外连接的outer虚拟slot
TupleTableSlot *hj_NullInnerTupleSlot;//用于外连接的inner虚拟slot
TupleTableSlot *hj_FirstOuterTupleSlot;//
int hj_JoinState;//JoinState状态
bool hj_MatchedOuter;//是否匹配
bool hj_OuterNotEmpty;//outer relation是否为空
} HashJoinState;
HashJoinTable
Hash表数据结构
typedef struct HashJoinTableData
{
int nbuckets; /* 内存中的hash桶数;# buckets in the in-memory hash table */
int log2_nbuckets; /* 2的对数(nbuckets必须是2的幂);its log2 (nbuckets must be a power of 2) */
int nbuckets_original; /* 首次hash时的桶数;# buckets when starting the first hash */
int nbuckets_optimal; /* 优化后的桶数(每个批次);optimal # buckets (per batch) */
int log2_nbuckets_optimal; /* 2的对数;log2(nbuckets_optimal) */
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
//bucket [i]是内存中第i个桶中的元组链表的head item
union
{
/* unshared array is per-batch storage, as are all the tuples */
//未共享数组是按批处理存储的,所有元组均如此
struct HashJoinTupleData **unshared;
/* shared array is per-query DSA area, as are all the tuples */
//共享数组是每个查询的DSA区域,所有元组均如此
dsa_pointer_atomic *shared;
} buckets;
bool keepNulls; /*如不匹配则存储NULL元组,该值为T;true to store unmatchable NULL tuples */
bool skewEnabled; /*是否使用倾斜优化?;are we using skew optimization? */
HashSkewBucket **skewBucket; /* 倾斜的hash表桶数;hashtable of skew buckets */
int skewBucketLen; /* skewBucket数组大小;size of skewBucket array (a power of 2!) */
int nSkewBuckets; /* 活动的倾斜桶数;number of active skew buckets */
int *skewBucketNums; /* 活动倾斜桶数组索引;array indexes of active skew buckets */
int nbatch; /* 批次数;number of batches */
int curbatch; /* 当前批次,第一轮为0;current batch #; 0 during 1st pass */
int nbatch_original; /* 在开始inner扫描时的批次;nbatch when we started inner scan */
int nbatch_outstart; /* 在开始outer扫描时的批次;nbatch when we started outer scan */
bool growEnabled; /* 关闭nbatch增加的标记;flag to shut off nbatch increases */
double totalTuples; /* 从inner plan获得的元组数;# tuples obtained from inner plan */
double partialTuples; /* 通过hashjoin获得的inner元组数;# tuples obtained from inner plan by me */
double skewTuples; /* 倾斜元组数;# tuples inserted into skew tuples */
/*
* These arrays are allocated for the life of the hash join, but only if
* nbatch > 1. A file is opened only when we first write a tuple into it
* (otherwise its pointer remains NULL). Note that the zero'th array
* elements never get used, since we will process rather than dump out any
* tuples of batch zero.
* 这些数组在散列连接的生命周期内分配,但仅当nbatch > 1时分配。
* 只有当第一次将元组写入文件时,文件才会打开(否则它的指针将保持NULL)。
* 注意,第0个数组元素永远不会被使用,因为批次0的元组永远不会转储.
*/
BufFile **innerBatchFile; /* 每个批次的inner虚拟临时文件缓存;buffered virtual temp file per batch */
BufFile **outerBatchFile; /* 每个批次的outer虚拟临时文件缓存;buffered virtual temp file per batch */
/*
* Info about the datatype-specific hash functions for the datatypes being
* hashed. These are arrays of the same length as the number of hash join
* clauses (hash keys).
* 有关正在散列的数据类型的特定于数据类型的散列函数的信息。
* 这些数组的长度与散列连接子句(散列键)的数量相同。
*/
FmgrInfo *outer_hashfunctions; /* outer hash函数FmgrInfo结构体;lookup data for hash functions */
FmgrInfo *inner_hashfunctions; /* inner hash函数FmgrInfo结构体;lookup data for hash functions */
bool *hashStrict; /* 每个hash操作符是严格?is each hash join operator strict? */
Size spaceUsed; /* 元组使用的当前内存空间大小;memory space currently used by tuples */
Size spaceAllowed; /* 空间使用上限;upper limit for space used */
Size spacePeak; /* 峰值的空间使用;peak space used */
Size spaceUsedSkew; /* 倾斜哈希表的当前空间使用情况;skew hash table's current space usage */
Size spaceAllowedSkew; /* 倾斜哈希表的使用上限;upper limit for skew hashtable */
MemoryContext hashCxt; /* 整个散列连接存储的上下文;context for whole-hash-join storage */
MemoryContext batchCxt; /* 该批次存储的上下文;context for this-batch-only storage */
/* used for dense allocation of tuples (into linked chunks) */
//用于密集分配元组(到链接块中)
HashMemoryChunk chunks; /* 整个批次使用一个链表;one list for the whole batch */
/* Shared and private state for Parallel Hash. */
//并行hash使用的共享和私有状态
HashMemoryChunk current_chunk; /* 后台进程的当前chunk;this backend's current chunk */
dsa_area *area; /* 用于分配内存的DSA区域;DSA area to allocate memory from */
ParallelHashJoinState *parallel_state;//并行执行状态
ParallelHashJoinBatchAccessor *batches;//并行访问器
dsa_pointer current_chunk_shared;//当前chunk的开始指针
} HashJoinTableData;
typedef struct HashJoinTableData *HashJoinTable;
HashJoinTupleData
Hash连接元组数据
/* ----------------------------------------------------------------
* hash-join hash table structures
*
* Each active hashjoin has a HashJoinTable control block, which is
* palloc'd in the executor's per-query context. All other storage needed
* for the hashjoin is kept in private memory contexts, two for each hashjoin.
* This makes it easy and fast to release the storage when we don't need it
* anymore. (Exception: data associated with the temp files lives in the
* per-query context too, since we always call buffile.c in that context.)
* 每个活动的hashjoin都有一个可散列的控制块,它在执行程序的每个查询上下文中都是通过palloc分配的。
* hashjoin所需的所有其他存储都保存在私有内存上下文中,每个hashjoin有两个。
* 当不再需要它的时候,这使得释放它变得简单和快速。
* (例外:与临时文件相关的数据也存在于每个查询上下文中,因为在这种情况下总是调用buffile.c。)
*
* The hashtable contexts are made children of the per-query context, ensuring
* that they will be discarded at end of statement even if the join is
* aborted early by an error. (Likewise, any temporary files we make will
* be cleaned up by the virtual file manager in event of an error.)
* hashtable上下文是每个查询上下文的子上下文,确保在语句结束时丢弃它们,即使连接因错误而提前中止。
* (同样,如果出现错误,虚拟文件管理器将清理创建的任何临时文件。)
*
* Storage that should live through the entire join is allocated from the
* "hashCxt", while storage that is only wanted for the current batch is
* allocated in the "batchCxt". By resetting the batchCxt at the end of
* each batch, we free all the per-batch storage reliably and without tedium.
* 通过整个连接的存储空间应从“hashCxt”分配,而只需要当前批处理的存储空间在“batchCxt”中分配。
* 通过在每个批处理结束时重置batchCxt,可以可靠地释放每个批处理的所有存储,而不会感到单调乏味。
*
* During first scan of inner relation, we get its tuples from executor.
* If nbatch > 1 then tuples that don't belong in first batch get saved
* into inner-batch temp files. The same statements apply for the
* first scan of the outer relation, except we write tuples to outer-batch
* temp files. After finishing the first scan, we do the following for
* each remaining batch:
* 1. Read tuples from inner batch file, load into hash buckets.
* 2. Read tuples from outer batch file, match to hash buckets and output.
* 在内部关系的第一次扫描中,从执行者那里得到了它的元组。
* 如果nbatch > 1,那么不属于第一批的元组将保存到批内临时文件中。
* 相同的语句适用于外关系的第一次扫描,但是我们将元组写入外部批处理临时文件。
* 完成第一次扫描后,我们对每批剩余的元组做如下处理:
* 1.从内部批处理文件读取元组,加载到散列桶中。
* 2.从外部批处理文件读取元组,匹配哈希桶和输出。
*
* It is possible to increase nbatch on the fly if the in-memory hash table
* gets too big. The hash-value-to-batch computation is arranged so that this
* can only cause a tuple to go into a later batch than previously thought,
* never into an earlier batch. When we increase nbatch, we rescan the hash
* table and dump out any tuples that are now of a later batch to the correct
* inner batch file. Subsequently, while reading either inner or outer batch
* files, we might find tuples that no longer belong to the current batch;
* if so, we just dump them out to the correct batch file.
* 如果内存中的哈希表太大,可以动态增加nbatch。
* 散列值到批处理的计算是这样安排的:
* 这只会导致元组进入比以前认为的更晚的批处理,而不会进入更早的批处理。
* 当增加nbatch时,重新扫描哈希表,并将现在属于后面批处理的任何元组转储到正确的内部批处理文件。
* 随后,在读取内部或外部批处理文件时,可能会发现不再属于当前批处理的元组;
* 如果是这样,只需将它们转储到正确的批处理文件即可。
* ----------------------------------------------------------------
*/
/* these are in nodes/execnodes.h: */
/* typedef struct HashJoinTupleData *HashJoinTuple; */
/* typedef struct HashJoinTableData *HashJoinTable; */
typedef struct HashJoinTupleData
{
/* link to next tuple in same bucket */
//link同一个桶中的下一个元组
union
{
struct HashJoinTupleData *unshared;
dsa_pointer shared;
} next;
uint32 hashvalue; /* 元组的hash值;tuple's hash code */
/* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */
} HashJoinTupleData;
#define HJTUPLE_OVERHEAD MAXALIGN(sizeof(HashJoinTupleData))
#define HJTUPLE_MINTUPLE(hjtup) \
((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
二、源码解读
ExecHashTableCreate
ExecHashTableCreate函数初始化hashjoin需要使用的hashtable.
/*----------------------------------------------------------------------------------------------------
HJ_BUILD_HASHTABLE 阶段
-----------------------------------------------------------------------------------------------------*/
/* ----------------
* these are defined to avoid confusion problems with "left"
* and "right" and "inner" and "outer". The convention is that
* the "left" plan is the "outer" plan and the "right" plan is
* the inner plan, but these make the code more readable.
* 这些定义是为了避免“左”和“右”以及“内”和“外”的混淆问题。
* 约定是,“左”计划是“外部”计划,“右”计划是内部计划,但是这些计划使代码更具可读性。
* ----------------
*/
#define innerPlan(node) (((Plan *)(node))->righttree)
#define outerPlan(node) (((Plan *)(node))->lefttree)
/* ----------------------------------------------------------------
* ExecHashTableCreate
*
* create an empty hashtable data structure for hashjoin.
* 初始化hashjoin需要使用的hashtable.
* ----------------------------------------------------------------
*/
HashJoinTable
ExecHashTableCreate(HashState *state, List *hashOperators, bool keepNulls)
{
Hash *node;
HashJoinTable hashtable;
Plan *outerNode;
size_t space_allowed;
int nbuckets;
int nbatch;
double rows;
int num_skew_mcvs;
int log2_nbuckets;
int nkeys;
int i;
ListCell *ho;
MemoryContext oldcxt;
/*
* Get information about the size of the relation to be hashed (it's the
* "outer" subtree of this node, but the inner relation of the hashjoin).
* Compute the appropriate size of the hash table.
* 获取有关要散列的关系大小的信息(它是该节点的“outer”子树,hashjoin的inner relation)。
* 计算哈希表的适当大小。
*/
node = (Hash *) state->ps.plan;//获取Hash节点
outerNode = outerPlan(node);//获取outer relation Plan节点
/*
* If this is shared hash table with a partial plan, then we can't use
* outerNode->plan_rows to estimate its size. We need an estimate of the
* total number of rows across all copies of the partial plan.
* 如果这是带有部分计划(并行处理)的共享哈希表,那么不能使用outerNode->plan_rows来估计它的大小。
* 需要估算跨部分计划的所有副本的行总数。
*/
rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows;//获取总行数
ExecChooseHashTableSize(rows, outerNode->plan_width,
OidIsValid(node->skewTable),
state->parallel_state != NULL,
state->parallel_state != NULL ?
state->parallel_state->nparticipants - 1 : 0,
&space_allowed,
&nbuckets, &nbatch, &num_skew_mcvs);//计算Hash Table的大小尺寸
/* nbuckets must be a power of 2 */
//nbuckets(hash桶数)必须是2的n次方
log2_nbuckets = my_log2(nbuckets);
Assert(nbuckets == (1 << log2_nbuckets));
/*
* Initialize the hash table control block.
* 初始化hash表的控制块
*
* The hashtable control block is just palloc'd from the executor's
* per-query memory context. Everything else should be kept inside the
* subsidiary hashCxt or batchCxt.
* hashtable控件块是从执行程序的每个查询内存上下文中调取的。
* 其他内容都应该保存在附属hashCxt或batchCxt中。
*/
hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));//分配内存
hashtable->nbuckets = nbuckets;//桶数
hashtable->nbuckets_original = nbuckets;
hashtable->nbuckets_optimal = nbuckets;
hashtable->log2_nbuckets = log2_nbuckets;
hashtable->log2_nbuckets_optimal = log2_nbuckets;
hashtable->buckets.unshared = NULL;
hashtable->keepNulls = keepNulls;
hashtable->skewEnabled = false;
hashtable->skewBucket = NULL;
hashtable->skewBucketLen = 0;
hashtable->nSkewBuckets = 0;
hashtable->skewBucketNums = NULL;
hashtable->nbatch = nbatch;
hashtable->curbatch = 0;
hashtable->nbatch_original = nbatch;
hashtable->nbatch_outstart = nbatch;
hashtable->growEnabled = true;
hashtable->totalTuples = 0;
hashtable->partialTuples = 0;
hashtable->skewTuples = 0;
hashtable->innerBatchFile = NULL;
hashtable->outerBatchFile = NULL;
hashtable->spaceUsed = 0;
hashtable->spacePeak = 0;
hashtable->spaceAllowed = space_allowed;
hashtable->spaceUsedSkew = 0;
hashtable->spaceAllowedSkew =
hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
hashtable->chunks = NULL;
hashtable->current_chunk = NULL;
hashtable->parallel_state = state->parallel_state;
hashtable->area = state->ps.state->es_query_dsa;
hashtable->batches = NULL;
#ifdef HJDEBUG
printf("Hashjoin %p: initial nbatch = %d, nbuckets = %d\n",
hashtable, nbatch, nbuckets);
#endif
/*
* Create temporary memory contexts in which to keep the hashtable working
* storage. See notes in executor/hashjoin.h.
* 创建临时内存上下文,以便在其中保持散列表的相关信息。
* 参见executor/hashjoin.h中的注释。
*/
hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
"HashTableContext",
ALLOCSET_DEFAULT_SIZES);
hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
"HashBatchContext",
ALLOCSET_DEFAULT_SIZES);
/* Allocate data that will live for the life of the hashjoin */
//分配内存,切换至hashCxt
oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
/*
* Get info about the hash functions to be used for each hash key. Also
* remember whether the join operators are strict.
* 获取关于每个散列键要使用的散列函数的信息。
* 还要记住连接操作符是否严格。
*/
nkeys = list_length(hashOperators);//键值数
hashtable->outer_hashfunctions =
(FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));//outer relation所使用的hash函数
hashtable->inner_hashfunctions =
(FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));//inner relation所使用的hash函数
hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool));//是否严格的操作符
i = 0;
foreach(ho, hashOperators)//遍历hash操作符
{
Oid hashop = lfirst_oid(ho);//hash操作符
Oid left_hashfn;//左函数
Oid right_hashfn;//右函数
//获取与给定操作符兼容的标准哈希函数的OID,并根据需要对其LHS和/或RHS数据类型进行操作。
if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))//获取hash函数
elog(ERROR, "could not find hash function for hash operator %u",
hashop);
fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]);
fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]);
hashtable->hashStrict[i] = op_strict(hashop);
i++;
}
if (nbatch > 1 && hashtable->parallel_state == NULL)//批次>1而且并行状态为NULL
{
/*
* allocate and initialize the file arrays in hashCxt (not needed for
* parallel case which uses shared tuplestores instead of raw files)
* 在hashCxt中分配和初始化文件数组(对于使用共享tuplestore而不是原始文件的并行情况不需要)
*/
hashtable->innerBatchFile = (BufFile **)
palloc0(nbatch * sizeof(BufFile *));//用于缓存该批次的inner relation的tuple
hashtable->outerBatchFile = (BufFile **)
palloc0(nbatch * sizeof(BufFile *));//用于缓存该批次的outerr relation的tuple
/* The files will not be opened until needed... */
/* ... but make sure we have temp tablespaces established for them */
//这些文件需要时才会打开……
//…但是要确保为它们建立了临时表空间
PrepareTempTablespaces();
}
MemoryContextSwitchTo(oldcxt);//切换回原内存上下文
if (hashtable->parallel_state)//并行处理
{
ParallelHashJoinState *pstate = hashtable->parallel_state;
Barrier *build_barrier;
/*
* Attach to the build barrier. The corresponding detach operation is
* in ExecHashTableDetach. Note that we won't attach to the
* batch_barrier for batch 0 yet. We'll attach later and start it out
* in PHJ_BATCH_PROBING phase, because batch 0 is allocated up front
* and then loaded while hashing (the standard hybrid hash join
* algorithm), and we'll coordinate that using build_barrier.
*/
build_barrier = &pstate->build_barrier;
BarrierAttach(build_barrier);
/*
* So far we have no idea whether there are any other participants,
* and if so, what phase they are working on. The only thing we care
* about at this point is whether someone has already created the
* SharedHashJoinBatch objects and the hash table for batch 0. One
* backend will be elected to do that now if necessary.
*/
if (BarrierPhase(build_barrier) == PHJ_BUILD_ELECTING &&
BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ELECTING))
{
pstate->nbatch = nbatch;
pstate->space_allowed = space_allowed;
pstate->growth = PHJ_GROWTH_OK;
/* Set up the shared state for coordinating batches. */
ExecParallelHashJoinSetUpBatches(hashtable, nbatch);
/*
* Allocate batch 0's hash table up front so we can load it
* directly while hashing.
*/
pstate->nbuckets = nbuckets;
ExecParallelHashTableAlloc(hashtable, 0);
}
/*
* The next Parallel Hash synchronization point is in
* MultiExecParallelHash(), which will progress it all the way to
* PHJ_BUILD_DONE. The caller must not return control from this
* executor node between now and then.
*/
}
else//非并行处理
{
/*
* Prepare context for the first-scan space allocations; allocate the
* hashbucket array therein, and set each bucket "empty".
* 为第一次扫描空间分配准备上下文;在其中分配hashbucket数组,并将每个bucket设置为“空”。
*/
MemoryContextSwitchTo(hashtable->batchCxt);//切换上下文
hashtable->buckets.unshared = (HashJoinTuple *)
palloc0(nbuckets * sizeof(HashJoinTuple));//分配内存空间
/*
* Set up for skew optimization, if possible and there's a need for
* more than one batch. (In a one-batch join, there's no point in
* it.)
* 如需要多个批处理,设置倾斜优化。(在单批处理连接中,这是没有意义的。)
*/
if (nbatch > 1)
ExecHashBuildSkewHash(hashtable, node, num_skew_mcvs);
MemoryContextSwitchTo(oldcxt);//切换上下文
}
return hashtable;//返回Hash表
}
/*
* This routine fills a FmgrInfo struct, given the OID
* of the function to be called.
* 给定要调用的函数的OID,这个例程填充一个FmgrInfo结构体。
*
* The caller's CurrentMemoryContext is used as the fn_mcxt of the info
* struct; this means that any subsidiary data attached to the info struct
* (either by fmgr_info itself, or later on by a function call handler)
* will be allocated in that context. The caller must ensure that this
* context is at least as long-lived as the info struct itself. This is
* not a problem in typical cases where the info struct is on the stack or
* in freshly-palloc'd space. However, if one intends to store an info
* struct in a long-lived table, it's better to use fmgr_info_cxt.
* 调用方的CurrentMemoryContext用作info结构体的fn_mcxt;
* 这意味着附加到info结构体的任何附属数据(通过fmgr_info本身,或者稍后通过函数调用处理程序)将在该上下文中分配。
* 调用者必须确保这个上下文的生命周期至少与info结构本身一样。
* 在信息结构位于堆栈上或在新palloc空间中的典型情况下,这不是一个问题。
* 但是,如果希望在long-lived表中存储信息结构,最好使用fmgr_info_cxt。
*/
void
fmgr_info(Oid functionId, FmgrInfo *finfo)
{
fmgr_info_cxt_security(functionId, finfo, CurrentMemoryContext, false);
}
ExecChooseHashTableSize
ExecChooseHashTableSize函数根据给定要散列的关系的估计大小(行数和平均行宽),计算适当的散列表大小。
/*
* Compute appropriate size for hashtable given the estimated size of the
* relation to be hashed (number of rows and average row width).
* 给定要散列的关系的估计大小(行数和平均行宽),计算适当的散列表大小。
*
* This is exported so that the planner's costsize.c can use it.
* 这些信息已导出以便计划器costsize.c可以使用
*/
/* Target bucket loading (tuples per bucket) */
#define NTUP_PER_BUCKET 1
void
ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
bool try_combined_work_mem,
int parallel_workers,
size_t *space_allowed,
int *numbuckets,
int *numbatches,
int *num_skew_mcvs)
{
int tupsize;//元组大小
double inner_rel_bytes;//inner relation大小
long bucket_bytes;//桶大小
long hash_table_bytes;//hash table大小
long skew_table_bytes;//倾斜表大小
long max_pointers;//最大的指针数
long mppow2;//
int nbatch = 1;//批次
int nbuckets;//桶数
double dbuckets;//
/* Force a plausible relation size if no info */
//如relation大小没有信息,则设定为默认值1000.0
if (ntuples <= 0.0)
ntuples = 1000.0;
/*
* Estimate tupsize based on footprint of tuple in hashtable... note this
* does not allow for any palloc overhead. The manipulations of spaceUsed
* don't count palloc overhead either.
* 根据哈希表中tuple的占用空间估计tupsize…
* 注意,这不允许任何palloc开销。使用的空间操作也不包括palloc开销。
*/
tupsize = HJTUPLE_OVERHEAD +
MAXALIGN(SizeofMinimalTupleHeader) +
MAXALIGN(tupwidth);//估算元组大小
inner_rel_bytes = ntuples * tupsize;//inner relation大小
/*
* Target in-memory hashtable size is work_mem kilobytes.
* 目标内存中的散列表大小为work_mem KB。
*/
hash_table_bytes = work_mem * 1024L;
/*
* Parallel Hash tries to use the combined work_mem of all workers to
* avoid the need to batch. If that won't work, it falls back to work_mem
* per worker and tries to process batches in parallel.
* 并行散列试图使用所有worker的所有work_mem来避免分批处理。
* 如果这不起作用,它将返回到每个worker的work_mem,并尝试并行处理批处理。
*/
if (try_combined_work_mem)//尝试融合work_mem
hash_table_bytes += hash_table_bytes * parallel_workers;
*space_allowed = hash_table_bytes;
/*
* If skew optimization is possible, estimate the number of skew buckets
* that will fit in the memory allowed, and decrement the assumed space
* available for the main hash table accordingly.
* 如果可以进行倾斜优化,估算允许内存中容纳的倾斜桶的数量,并相应地减少主哈希表的假定可用空间。
*
* We make the optimistic assumption that each skew bucket will contain
* one inner-relation tuple. If that turns out to be low, we will recover
* at runtime by reducing the number of skew buckets.
* 我们乐观地假设,每个倾斜桶将包含一个内部关系元组。
* 如果结果很低,将通过减少倾斜桶的数量在运行时进行恢复。
*
* hashtable->skewBucket will have up to 8 times as many HashSkewBucket
* pointers as the number of MCVs we allow, since ExecHashBuildSkewHash
* will round up to the next power of 2 and then multiply by 4 to reduce
* collisions.
* hashtable->skewBucket的指针数量将是允许的mcv数量的8倍,
* 因为ExecHashBuildSkewHash将四舍五入到下一个2次方,然后乘以4以减少冲突。
*/
if (useskew)
{
//倾斜优化
skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;
/*----------
* Divisor is:
* size of a hash tuple +
* worst-case size of skewBucket[] per MCV +
* size of skewBucketNums[] entry +
* size of skew bucket struct itself
*----------
*/
*num_skew_mcvs = skew_table_bytes / (tupsize +
(8 * sizeof(HashSkewBucket *)) +
sizeof(int) +
SKEW_BUCKET_OVERHEAD);
if (*num_skew_mcvs > 0)
hash_table_bytes -= skew_table_bytes;
}
else
*num_skew_mcvs = 0;//不使用倾斜优化,默认为0
/*
* Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
* memory is filled, assuming a single batch; but limit the value so that
* the pointer arrays we'll try to allocate do not exceed work_mem nor
* MaxAllocSize.
* 设置nbuckets,假设为单批处理,当内存被填满时,实现NTUP_PER_BUCKET的平均桶负载;
* 但是要限制这个值,以便试图分配的指针数组不会超过work_mem或MaxAllocSize。
*
* Note that both nbuckets and nbatch must be powers of 2 to make
* ExecHashGetBucketAndBatch fast.
* 注意,nbucket和nbatch都必须是2的幂,才能使ExecHashGetBucketAndBatch更快。
*/
max_pointers = *space_allowed / sizeof(HashJoinTuple);//最大指针数
max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));//控制上限
/* If max_pointers isn't a power of 2, must round it down to one */
//如果max_pointer不是2的幂,则必须四舍五入到符合规则的某个值(如110.1 --> 128)
mppow2 = 1L << my_log2(max_pointers);
if (max_pointers != mppow2)
max_pointers = mppow2 / 2;
/* Also ensure we avoid integer overflow in nbatch and nbuckets */
/* (this step is redundant given the current value of MaxAllocSize) */
//还要确保在nbatch和nbucket中避免整数溢出
//(鉴于MaxAllocSize的当前值,此步骤是多余的)
max_pointers = Min(max_pointers, INT_MAX / 2);//设定上限
dbuckets = ceil(ntuples / NTUP_PER_BUCKET);//取整
dbuckets = Min(dbuckets, max_pointers);//设定上限
nbuckets = (int) dbuckets;//桶数
/* don't let nbuckets be really small, though ... */
//但是,不要让nbucket非常小……
nbuckets = Max(nbuckets, 1024);//设定下限(1024)
/* ... and force it to be a power of 2. */
//2的幂
nbuckets = 1 << my_log2(nbuckets);
/*
* If there's not enough space to store the projected number of tuples and
* the required bucket headers, we will need multiple batches.
* 如果没有足够的空间来存储预计的元组数量和所需的bucket headers,将需要多个批处理。
*/
bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
if (inner_rel_bytes + bucket_bytes > hash_table_bytes)//inner relation大小 + 桶数大于可用空间
{
/* We'll need multiple batches */
//需要多批次
long lbuckets;
double dbatch;
int minbatch;
long bucket_size;
/*
* If Parallel Hash with combined work_mem would still need multiple
* batches, we'll have to fall back to regular work_mem budget.
* 如果合并了work_mem的并行散列仍然需要多个批处理,将不得不回到常规的work_mem预算。
*/
if (try_combined_work_mem)
{
ExecChooseHashTableSize(ntuples, tupwidth, useskew,
false, parallel_workers,
space_allowed,
numbuckets,
numbatches,
num_skew_mcvs);
return;
}
/*
* Estimate the number of buckets we'll want to have when work_mem is
* entirely full. Each bucket will contain a bucket pointer plus
* NTUP_PER_BUCKET tuples, whose projected size already includes
* overhead for the hash code, pointer to the next tuple, etc.
* 估计work_mem完全用完时需要的桶数。
* 每个桶将包含一个桶指针和NTUP_PER_BUCKET元组,
* 其投影大小已经包括哈希码的开销、指向下一个元组的指针等等。
*/
bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));//桶大小
lbuckets = 1L << my_log2(hash_table_bytes / bucket_size);
lbuckets = Min(lbuckets, max_pointers);
nbuckets = (int) lbuckets;
nbuckets = 1 << my_log2(nbuckets);
bucket_bytes = nbuckets * sizeof(HashJoinTuple);
/*
* Buckets are simple pointers to hashjoin tuples, while tupsize
* includes the pointer, hash code, and MinimalTupleData. So buckets
* should never really exceed 25% of work_mem (even for
* NTUP_PER_BUCKET=1); except maybe for work_mem values that are not
* 2^N bytes, where we might get more because of doubling. So let's
* look for 50% here.
* Buckets是指向hashjoin元组的简单指针,而tupsize包含指针、散列代码和MinimalTupleData。
* 所以Buckets的实际大小不应该超过work_mem的25%(即使对于NTUP_PER_BUCKET=1);
* 除了work_mem值不是2 ^ N个字节这个原因外,翻倍可能会得到更多的,这里试着使用50%
*/
Assert(bucket_bytes <= hash_table_bytes / 2);
/* Calculate required number of batches. */
//计算批次数
dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes));
dbatch = Min(dbatch, max_pointers);
minbatch = (int) dbatch;
nbatch = 2;
while (nbatch < minbatch)
nbatch <<= 1;
}
Assert(nbuckets > 0);
Assert(nbatch > 0);
*numbuckets = nbuckets;
*numbatches = nbatch;
}
三、跟踪分析
测试脚本如下
testdb=# set enable_nestloop=false;
SET
testdb=# set enable_mergejoin=false;
SET
testdb=# explain verbose select dw.*,grjf.grbh,grjf.xm,grjf.ny,grjf.je
testdb-# from t_dwxx dw,lateral (select gr.grbh,gr.xm,jf.ny,jf.je
testdb(# from t_grxx gr inner join t_jfxx jf
testdb(# on gr.dwbh = dw.dwbh
testdb(# and gr.grbh = jf.grbh) grjf
testdb-# order by dw.dwbh;
QUERY PLAN
-----------------------------------------------------------------------------------------------
Sort (cost=14828.83..15078.46 rows=99850 width=47)
Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.je
Sort Key: dw.dwbh
-> Hash Join (cost=3176.00..6537.55 rows=99850 width=47)
Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.je
Hash Cond: ((gr.grbh)::text = (jf.grbh)::text)
-> Hash Join (cost=289.00..2277.61 rows=99850 width=32)
Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm
Inner Unique: true
Hash Cond: ((gr.dwbh)::text = (dw.dwbh)::text)
-> Seq Scan on public.t_grxx gr (cost=0.00..1726.00 rows=100000 width=16)
Output: gr.dwbh, gr.grbh, gr.xm, gr.xb, gr.nl
-> Hash (cost=164.00..164.00 rows=10000 width=20)
Output: dw.dwmc, dw.dwbh, dw.dwdz
-> Seq Scan on public.t_dwxx dw (cost=0.00..164.00 rows=10000 width=20)
Output: dw.dwmc, dw.dwbh, dw.dwdz
-> Hash (cost=1637.00..1637.00 rows=100000 width=20)
Output: jf.ny, jf.je, jf.grbh
-> Seq Scan on public.t_jfxx jf (cost=0.00..1637.00 rows=100000 width=20)
Output: jf.ny, jf.je, jf.grbh
(20 rows)
启动gdb,设置断点,进入ExecHashTableCreate
(gdb) b ExecHashTableCreate
Breakpoint 1 at 0x6fc75d: file nodeHash.c, line 449.
(gdb) c
Continuing.
Breakpoint 1, ExecHashTableCreate (state=0x1e3cbc8, hashOperators=0x1e59890, keepNulls=false) at nodeHash.c:449
449 node = (Hash *) state->ps.plan;
获取相关信息
449 node = (Hash *) state->ps.plan;
(gdb) n
450 outerNode = outerPlan(node);
(gdb)
457 rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows;
(gdb)
462 state->parallel_state != NULL ?
(gdb)
459 ExecChooseHashTableSize(rows, outerNode->plan_width,
(gdb)
获取Hash节点;
outer节点为顺序扫描SeqScan节点
inner(构造hash表的relation)行数为10000
(gdb) p *node
$1 = {plan = {type = T_Hash, startup_cost = 164, total_cost = 164, plan_rows = 10000, plan_width = 20,
parallel_aware = false, parallel_safe = true, plan_node_id = 4, targetlist = 0x1e4bf90, qual = 0x0,
lefttree = 0x1e493e8, righttree = 0x0, initPlan = 0x0, extParam = 0x0, allParam = 0x0}, skewTable = 16977,
skewColumn = 1, skewInherit = false, rows_total = 0}
(gdb) p *outerNode
$2 = {type = T_SeqScan, startup_cost = 0, total_cost = 164, plan_rows = 10000, plan_width = 20, parallel_aware = false,
parallel_safe = true, plan_node_id = 5, targetlist = 0x1e492b0, qual = 0x0, lefttree = 0x0, righttree = 0x0,
initPlan = 0x0, extParam = 0x0, allParam = 0x0}
(gdb) p rows
$3 = 10000
(gdb)
进入ExecChooseHashTableSize函数
(gdb) step
ExecChooseHashTableSize (ntuples=10000, tupwidth=20, useskew=true, try_combined_work_mem=false, parallel_workers=0,
space_allowed=0x7ffdcf148540, numbuckets=0x7ffdcf14853c, numbatches=0x7ffdcf148538, num_skew_mcvs=0x7ffdcf148534)
at nodeHash.c:677
677 int nbatch = 1;
ExecChooseHashTableSize->计算元组大小(56B)/inner relation大小(约560K)/hash表空间(16M)
(gdb) n
682 if (ntuples <= 0.0)
(gdb)
690 tupsize = HJTUPLE_OVERHEAD +
(gdb)
693 inner_rel_bytes = ntuples * tupsize;
(gdb)
698 hash_table_bytes = work_mem * 1024L;
(gdb)
705 if (try_combined_work_mem)
(gdb) p tupsize
$4 = 56
(gdb) p inner_rel_bytes
$5 = 560000
(gdb) p hash_table_bytes
$6 = 16777216
ExecChooseHashTableSize->使用数据倾斜优化(所需空间从Hash Table中获取)
(gdb) n
708 *space_allowed = hash_table_bytes;
(gdb)
724 if (useskew)
(gdb)
726 skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;
(gdb) p useskew
$8 = true
(gdb) p hash_table_bytes
$9 = 16441672
(gdb) p skew_table_bytes
$10 = 335544
(gdb) p num_skew_mcvs
$11 = (int *) 0x7ffdcf148534
(gdb) p *num_skew_mcvs
$12 = 2396
(gdb)
ExecChooseHashTableSize->获取最大指针数目(2097152)
(gdb) n
756 max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));
(gdb)
758 mppow2 = 1L << my_log2(max_pointers);
(gdb) n
759 if (max_pointers != mppow2)
(gdb) p max_pointers
$13 = 2097152
(gdb) p mppow2
$15 = 2097152
ExecChooseHashTableSize->计算Hash桶数
(gdb) n
764 max_pointers = Min(max_pointers, INT_MAX / 2);
(gdb)
766 dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
(gdb)
767 dbuckets = Min(dbuckets, max_pointers);
(gdb)
768 nbuckets = (int) dbuckets;
(gdb)
770 nbuckets = Max(nbuckets, 1024);
(gdb)
772 nbuckets = 1 << my_log2(nbuckets);
(gdb)
778 bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
(gdb) n
779 if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
(gdb)
834 Assert(nbuckets > 0);
(gdb) p dbuckets
$16 = 10000
(gdb) p nbuckets
$17 = 16384
(gdb) p bucket_bytes
$18 = 131072
ExecChooseHashTableSize->只需要一个批次,赋值,返回
835 Assert(nbatch > 0);
(gdb)
837 *numbuckets = nbuckets;
(gdb)
838 *numbatches = nbatch;
(gdb)
839 }
(gdb)
(gdb)
ExecHashTableCreate (state=0x1e3cbc8, hashOperators=0x1e59890, keepNulls=false) at nodeHash.c:468
468 log2_nbuckets = my_log2(nbuckets);
初始化Hash表
468 log2_nbuckets = my_log2(nbuckets);
(gdb) p nbuckets
$19 = 16384
(gdb) n
469 Assert(nbuckets == (1 << log2_nbuckets));
(gdb)
478 hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
(gdb)
479 hashtable->nbuckets = nbuckets;
...
分配内存上下文
...
(gdb)
522 hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,
(gdb)
526 hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,
(gdb)
532 oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
(gdb)
切换上下文,并初始化hash函数
(gdb)
532 oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
(gdb) n
538 nkeys = list_length(hashOperators);
(gdb)
540 (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
(gdb) p nkeys
$20 = 1
(gdb) n
539 hashtable->outer_hashfunctions =
(gdb)
542 (FmgrInfo *) palloc(nkeys * sizeof(FmgrInfo));
(gdb)
541 hashtable->inner_hashfunctions =
(gdb)
543 hashtable->hashStrict = (bool *) palloc(nkeys * sizeof(bool));
(gdb)
544 i = 0;
初始化Hash操作符
(gdb) n
545 foreach(ho, hashOperators)
(gdb)
547 Oid hashop = lfirst_oid(ho);
(gdb)
551 if (!get_op_hash_functions(hashop, &left_hashfn, &right_hashfn))
(gdb)
554 fmgr_info(left_hashfn, &hashtable->outer_hashfunctions[i]);
(gdb)
555 fmgr_info(right_hashfn, &hashtable->inner_hashfunctions[i]);
(gdb)
556 hashtable->hashStrict[i] = op_strict(hashop);
(gdb)
557 i++;
(gdb)
545 foreach(ho, hashOperators)
(gdb) p *hashtable->hashStrict
$21 = true
(gdb) n
560 if (nbatch > 1 && hashtable->parallel_state == NULL)
分配hash桶内存空间
gdb) n
575 MemoryContextSwitchTo(oldcxt);
(gdb)
577 if (hashtable->parallel_state)
(gdb)
631 MemoryContextSwitchTo(hashtable->batchCxt);
(gdb)
634 palloc0(nbuckets * sizeof(HashJoinTuple));
(gdb)
633 hashtable->buckets.unshared = (HashJoinTuple *)
(gdb) p nbuckets
$23 = 16384
构造完成,返回hash表
(gdb) n
641 if (nbatch > 1)
(gdb)
644 MemoryContextSwitchTo(oldcxt);
(gdb)
647 return hashtable;
(gdb)
648 }
(gdb)
ExecHashJoinImpl (pstate=0x1e3c048, parallel=false) at nodeHashjoin.c:282
282 node->hj_HashTable = hashtable;
(gdb)
DONE!
四、参考资料
Hash Joins: Past, Present and Future/PGCon 2017
A Look at How Postgres Executes a Tiny Join - Part 1
A Look at How Postgres Executes a Tiny Join - Part 2
Assignment 2 Symmetric Hash Join
名称栏目:PostgreSQL源码解读(91)-查询语句#76(ExecHashJoin函数#2)
网页地址:http://scyanting.com/article/igipsg.html