diff -cr --new-file pgsql/src/backend/commands/explain.c pgsql-hashdistinct/src/backend/commands/explain.c *** pgsql/src/backend/commands/explain.c 2006-04-08 14:49:52.000000000 -0400 --- pgsql-hashdistinct/src/backend/commands/explain.c 2006-08-02 12:30:20.000000000 -0400 *************** *** 560,565 **** --- 560,568 ---- case T_Unique: pname = "Unique"; break; + case T_HashDistinct: + pname = "HashDistinct"; + break; case T_SetOp: switch (((SetOp *) plan)->cmd) { diff -cr --new-file pgsql/src/backend/executor/execAmi.c pgsql-hashdistinct/src/backend/executor/execAmi.c *** pgsql/src/backend/executor/execAmi.c 2006-03-05 10:58:25.000000000 -0500 --- pgsql-hashdistinct/src/backend/executor/execAmi.c 2006-08-02 12:30:58.000000000 -0400 *************** *** 41,46 **** --- 41,47 ---- #include "executor/nodeSubqueryscan.h" #include "executor/nodeTidscan.h" #include "executor/nodeUnique.h" + #include "executor/nodeHashDistinct.h" /* *************** *** 178,183 **** --- 179,188 ---- case T_UniqueState: ExecReScanUnique((UniqueState *) node, exprCtxt); break; + + case T_HashDistinctState: + ExecReScanHashDistinct((HashDistinctState *) node, exprCtxt); + break; case T_HashState: ExecReScanHash((HashState *) node, exprCtxt); *************** *** 373,378 **** --- 378,386 ---- case T_Unique: return ExecSupportsBackwardScan(outerPlan(node)); + + case T_HashDistinct: + return ExecSupportsBackwardScan(outerPlan(node)); case T_Limit: return ExecSupportsBackwardScan(outerPlan(node)); diff -cr --new-file pgsql/src/backend/executor/execProcnode.c pgsql-hashdistinct/src/backend/executor/execProcnode.c *** pgsql/src/backend/executor/execProcnode.c 2006-05-30 10:01:57.000000000 -0400 --- pgsql-hashdistinct/src/backend/executor/execProcnode.c 2006-06-26 10:52:02.000000000 -0400 *************** *** 102,107 **** --- 102,108 ---- #include "executor/nodeSubqueryscan.h" #include "executor/nodeTidscan.h" #include "executor/nodeUnique.h" + #include "executor/nodeHashDistinct.h" #include 
"miscadmin.h" #include "tcop/tcopprot.h" *************** *** 240,245 **** --- 241,251 ---- result = (PlanState *) ExecInitUnique((Unique *) node, estate, eflags); break; + + case T_HashDistinct: + result = (PlanState *) ExecInitHashDistinct((HashDistinct *) node, + estate, eflags); + break; case T_Hash: result = (PlanState *) ExecInitHash((Hash *) node, *************** *** 403,408 **** --- 409,418 ---- case T_UniqueState: result = ExecUnique((UniqueState *) node); break; + + case T_HashDistinctState: + result = ExecHashDistinct((HashDistinctState *) node); + break; case T_HashState: result = ExecHash((HashState *) node); *************** *** 567,572 **** --- 577,585 ---- case T_Unique: return ExecCountSlotsUnique((Unique *) node); + case T_HashDistinct: + return ExecCountSlotsHashDistinct((HashDistinct *) node); + case T_Hash: return ExecCountSlotsHash((Hash *) node); *************** *** 707,712 **** --- 720,729 ---- case T_UniqueState: ExecEndUnique((UniqueState *) node); break; + + case T_HashDistinctState: + ExecEndHashDistinct((HashDistinctState *) node); + break; case T_HashState: ExecEndHash((HashState *) node); diff -cr --new-file pgsql/src/backend/executor/Makefile pgsql-hashdistinct/src/backend/executor/Makefile *** pgsql/src/backend/executor/Makefile 2005-04-19 18:35:11.000000000 -0400 --- pgsql-hashdistinct/src/backend/executor/Makefile 2006-06-22 14:05:37.000000000 -0400 *************** *** 19,25 **** nodeBitmapHeapscan.o nodeBitmapIndexscan.o nodeHash.o \ nodeHashjoin.o nodeIndexscan.o nodeMaterial.o nodeMergejoin.o \ nodeNestloop.o nodeFunctionscan.o nodeResult.o nodeSeqscan.o \ ! nodeSetOp.o nodeSort.o nodeUnique.o nodeLimit.o nodeGroup.o \ nodeSubplan.o nodeSubqueryscan.o nodeTidscan.o tstoreReceiver.o spi.o all: SUBSYS.o --- 19,25 ---- nodeBitmapHeapscan.o nodeBitmapIndexscan.o nodeHash.o \ nodeHashjoin.o nodeIndexscan.o nodeMaterial.o nodeMergejoin.o \ nodeNestloop.o nodeFunctionscan.o nodeResult.o nodeSeqscan.o \ ! 
nodeSetOp.o nodeSort.o nodeUnique.o nodeHashDistinct.o nodeLimit.o nodeGroup.o \ nodeSubplan.o nodeSubqueryscan.o nodeTidscan.o tstoreReceiver.o spi.o all: SUBSYS.o diff -cr --new-file pgsql/src/backend/executor/nodeHashDistinct.c pgsql-hashdistinct/src/backend/executor/nodeHashDistinct.c *** pgsql/src/backend/executor/nodeHashDistinct.c 1969-12-31 19:00:00.000000000 -0500 --- pgsql-hashdistinct/src/backend/executor/nodeHashDistinct.c 2006-06-19 14:08:58.000000000 -0400 *************** *** 0 **** --- 1,242 ---- + /*------------------------------------------------------------------------- + * + * nodeHashDistinct.c + * Routines to handle unique'ing of queries where appropriate + * + * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL: pgsql/src/backend/executor/nodeUnique.c,v 1.52 2006/03/05 15:58:26 momjian Exp $ + * + *------------------------------------------------------------------------- + */ + /* + * INTERFACE ROUTINES + * ExecHashDistinct - generate a unique'd temporary relation + * ExecInitHashDistinct - initialize node and subnodes.. + * ExecEndHashDistinct - shutdown node and subnodes + * + * NOTES + * Uses an in-memory hash table to detect duplicates, so tuples + * returned from the subplan need not arrive in any particular order. + */ + + #include "postgres.h" + + #include "access/heapam.h" + #include "executor/executor.h" + #include "executor/nodeHashDistinct.h" + #include "utils/memutils.h" + + + /* ---------------------------------------------------------------- + * ExecHashDistinct + * + * This is a very simple node which filters out duplicate + * tuples from a stream of tuples from a subplan, using a hash table. 
+ * ---------------------------------------------------------------- + */ + TupleTableSlot * /* return: a tuple or NULL */ + ExecHashDistinct(HashDistinctState *node) + { + TupleTableSlot *resultTupleSlot; + TupleTableSlot *slot; + PlanState *outerPlan; + bool isnew = FALSE; + + /* + * get information from the node + */ + outerPlan = outerPlanState(node); + resultTupleSlot = node->ps.ps_ResultTupleSlot; + + /* + * now loop, returning only tuples not previously seen. Each tuple + * fetched from the subplan is probed against the hash table and + * duplicates are skipped; no input ordering is required. + * + * At the end of the subplan, clear the result slot so that no stale + * tuple lingers there if the node is rescanned or scanned further. + */ + for (;;) + { + /* + * fetch a tuple from the outer subplan + */ + slot = ExecProcNode(outerPlan); + if (TupIsNull(slot)) + { + /* end of subplan; reset in case we change direction */ + ExecClearTuple(resultTupleSlot); + return NULL; + } + + /* + * Test if the new tuple already exists in the hash table + * If so then we loop back and fetch another new tuple from the + * subplan. + */ + LookupTupleHashEntry(node->hashtable, slot, &isnew); + if (isnew) + break; + } + + /* + * We have a tuple that was not already present in the hash table. + * Save it and return it. We must copy it because the source subplan + * won't guarantee that this source tuple is still accessible after + * fetching the next source tuple. + */ + return ExecCopySlot(resultTupleSlot, slot); + } + + /* ---------------------------------------------------------------- + * ExecInitHashDistinct + * + * This initializes the HashDistinct node state structures and + * the node's subplan. 
+ * ---------------------------------------------------------------- + */ + HashDistinctState * + ExecInitHashDistinct(HashDistinct *node, EState *estate, int eflags) + { + HashDistinctState *distinctstate; + + /* check for unsupported flags */ + Assert(!(eflags & EXEC_FLAG_MARK)); + + /* + * create state structure + */ + distinctstate = makeNode(HashDistinctState); + distinctstate->ps.plan = (Plan *) node; + distinctstate->ps.state = estate; + + /* + * Miscellaneous initialization + * + * HashDistinct nodes have no ExprContext initialization because they never call + * ExecQual or ExecProject. But they do need a per-tuple memory context + * anyway for calling LookupTupleHashEntry. + * + * It also needs a long-lived memory context to contain the hash table. + */ + distinctstate->tempContext = + AllocSetContextCreate(CurrentMemoryContext, + "HashDistinct tempContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + distinctstate->tableContext = + AllocSetContextCreate(CurrentMemoryContext, + "HashDistinct tableContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + /* + * Tuple table initialization + */ + ExecInitResultTupleSlot(estate, &distinctstate->ps); + + /* + * then initialize outer plan + */ + outerPlanState(distinctstate) = ExecInitNode(outerPlan(node), estate, eflags); + + /* + * HashDistinct nodes do no projections, so initialize projection info for this + * node appropriately + */ + ExecAssignResultTypeFromTL(&distinctstate->ps); + distinctstate->ps.ps_ProjInfo = NULL; + + /* + * Precompute lookup data for hash table lookup + */ + execTuplesHashPrepare(ExecGetResultType(&distinctstate->ps), + node->numCols, + node->uniqColIdx, + &distinctstate->eqfunctions, + &distinctstate->hashfunctions); + + distinctstate->hashtable = BuildTupleHashTable(node->numCols, + node->uniqColIdx, + distinctstate->eqfunctions, + distinctstate->hashfunctions, + node->numDistinct, + 
sizeof(TupleHashEntryData), + distinctstate->tableContext, + distinctstate->tempContext); + + #define UNIQUE_NSLOTS 1 + + return distinctstate; + } + + int + ExecCountSlotsHashDistinct(HashDistinct *node) + { + return ExecCountSlotsNode(outerPlan(node)) + + ExecCountSlotsNode(innerPlan(node)) + + UNIQUE_NSLOTS; + } + + /* ---------------------------------------------------------------- + * ExecEndHashDistinct + * + * This shuts down the subplan and frees resources allocated + * to this node. + * ---------------------------------------------------------------- + */ + void + ExecEndHashDistinct(HashDistinctState *node) + { + /* clean up tuple table */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + MemoryContextDelete(node->tempContext); + MemoryContextDelete(node->tableContext); + + ExecEndNode(outerPlanState(node)); + } + + + void + ExecReScanHashDistinct(HashDistinctState *node, ExprContext *exprCtxt) + { + HashDistinct *plannode = (HashDistinct *) node->ps.plan; + + /* must clear result tuple so first input tuple is returned */ + ExecClearTuple(node->ps.ps_ResultTupleSlot); + + /* release temporary storage */ + MemoryContextReset(node->tempContext); + + /* rebuild the hash table. NOTE(review): the previous table is never + * released here, so its memory stays in tableContext until node + * shutdown; a MemoryContextReset(node->tableContext) before rebuilding + * would avoid accumulating old tables across rescans — confirm. */ + execTuplesHashPrepare(ExecGetResultType(&node->ps), + plannode->numCols, + plannode->uniqColIdx, + &node->eqfunctions, + &node->hashfunctions); + + node->hashtable = BuildTupleHashTable(plannode->numCols, + plannode->uniqColIdx, + node->eqfunctions, + node->hashfunctions, + plannode->numDistinct, + sizeof(TupleHashEntryData), + node->tableContext, + node->tempContext); + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. 
+ */ + if (((PlanState *) node)->lefttree->chgParam == NULL) + ExecReScan(((PlanState *) node)->lefttree, exprCtxt); + } diff -cr --new-file pgsql/src/backend/nodes/copyfuncs.c pgsql-hashdistinct/src/backend/nodes/copyfuncs.c *** pgsql/src/backend/nodes/copyfuncs.c 2006-06-08 14:42:46.000000000 -0400 --- pgsql-hashdistinct/src/backend/nodes/copyfuncs.c 2006-08-02 12:23:38.000000000 -0400 *************** *** 556,561 **** --- 556,585 ---- } /* + * _copyHashDistinct + */ + static HashDistinct * + _copyHashDistinct(HashDistinct *from) + { + HashDistinct *newnode = makeNode(HashDistinct); + + /* + * copy node superclass fields + */ + CopyPlanFields((Plan *) from, (Plan *) newnode); + + /* + * copy remainder of node + */ + COPY_SCALAR_FIELD(numCols); + COPY_SCALAR_FIELD(numDistinct); + COPY_POINTER_FIELD(uniqColIdx, from->numCols * sizeof(AttrNumber)); + + return newnode; + } + + + /* * _copyHash */ static Hash * *************** *** 2831,2836 **** --- 2855,2863 ---- case T_Unique: retval = _copyUnique(from); break; + case T_HashDistinct: + retval = _copyHashDistinct(from); + break; case T_Hash: retval = _copyHash(from); break; diff -cr --new-file pgsql/src/backend/nodes/outfuncs.c pgsql-hashdistinct/src/backend/nodes/outfuncs.c *** pgsql/src/backend/nodes/outfuncs.c 2006-04-30 14:30:39.000000000 -0400 --- pgsql-hashdistinct/src/backend/nodes/outfuncs.c 2006-06-22 15:07:45.000000000 -0400 *************** *** 520,525 **** --- 520,541 ---- } static void + _outHashDistinct(StringInfo str, HashDistinct *node) + { + int i; + + WRITE_NODE_TYPE("HASHDISTINCT"); + + _outPlanInfo(str, (Plan *) node); + + WRITE_INT_FIELD(numCols); + + appendStringInfo(str, " :uniqColIdx"); + for (i = 0; i < node->numCols; i++) + appendStringInfo(str, " %d", node->uniqColIdx[i]); + } + + static void _outSetOp(StringInfo str, SetOp *node) { int i; *************** *** 1900,1905 **** --- 1916,1924 ---- case T_Unique: _outUnique(str, obj); break; + case T_HashDistinct: + _outHashDistinct(str, obj); + 
break; case T_SetOp: _outSetOp(str, obj); break; diff -cr --new-file pgsql/src/backend/nodes/print.c pgsql-hashdistinct/src/backend/nodes/print.c *** pgsql/src/backend/nodes/print.c 2006-04-04 15:35:34.000000000 -0400 --- pgsql-hashdistinct/src/backend/nodes/print.c 2006-08-02 12:25:19.000000000 -0400 *************** *** 525,530 **** --- 525,532 ---- return "AGG"; case T_Unique: return "UNIQUE"; + case T_HashDistinct: + return "HASHDISTINCT"; case T_SetOp: return "SETOP"; case T_Limit: diff -cr --new-file pgsql/src/backend/optimizer/plan/createplan.c pgsql-hashdistinct/src/backend/optimizer/plan/createplan.c *** pgsql/src/backend/optimizer/plan/createplan.c 2006-05-18 14:57:31.000000000 -0400 --- pgsql-hashdistinct/src/backend/optimizer/plan/createplan.c 2006-08-18 16:52:43.000000000 -0400 *************** *** 170,175 **** --- 170,179 ---- plan = (Plan *) create_unique_plan(root, (UniquePath *) best_path); break; + case T_HashDistinct: + plan = (Plan *) create_unique_plan(root, + (UniquePath *) best_path); + break; default: elog(ERROR, "unrecognized node type: %d", (int) best_path->pathtype); *************** *** 2749,2754 **** --- 2753,2815 ---- /* * distinctList is a list of SortClauses, identifying the targetlist items + * that should be considered by the HashDistinct filter. + */ + HashDistinct * + make_hash_distinct(Plan *lefttree, + List *distinctList, + long numDistinct) + { + HashDistinct *node = makeNode(HashDistinct); + Plan *plan = &node->plan; + int numCols = list_length(distinctList); + int keyno = 0; + AttrNumber *uniqColIdx; + ListCell *slitem; + + copy_plan_costsize(plan, lefttree); + + /* + * Charge one cpu_operator_cost per comparison per input tuple. We assume + * all columns get compared at most of the tuples. (XXX probably this is + * an overestimate.) 
+ */ + plan->total_cost += cpu_operator_cost * plan->plan_rows * numCols; + + /* + * plan->plan_rows is left as a copy of the input subplan's plan_rows; ie, + * we assume the filter removes nothing. The caller must alter this if he + * has a better idea. + */ + + plan->targetlist = copyObject(lefttree->targetlist); + plan->qual = NIL; + plan->lefttree = lefttree; + plan->righttree = NULL; + + /* + * convert SortClause list into array of attr indexes, as wanted by exec + */ + Assert(numCols > 0); + uniqColIdx = (AttrNumber *) palloc(sizeof(AttrNumber) * numCols); + + foreach(slitem, distinctList) + { + SortClause *sortcl = (SortClause *) lfirst(slitem); + TargetEntry *tle = get_sortgroupclause_tle(sortcl, plan->targetlist); + + uniqColIdx[keyno++] = tle->resno; + } + + node->numCols = numCols; + node->uniqColIdx = uniqColIdx; + node->numDistinct = numDistinct; + + return node; + } + + /* + * distinctList is a list of SortClauses, identifying the targetlist items * that should be considered by the SetOp filter. 
*/ *************** *** 2931,2936 **** --- 2992,2998 ---- case T_Material: case T_Sort: case T_Unique: + case T_HashDistinct: case T_SetOp: case T_Limit: case T_Append: diff -cr --new-file pgsql/src/backend/optimizer/plan/planner.c pgsql-hashdistinct/src/backend/optimizer/plan/planner.c *** pgsql/src/backend/optimizer/plan/planner.c 2006-03-05 10:58:29.000000000 -0500 --- pgsql-hashdistinct/src/backend/optimizer/plan/planner.c 2006-08-20 22:04:49.000000000 -0400 *************** *** 73,78 **** --- 73,86 ---- List *sub_tlist, AttrNumber *groupColIdx); static List *postprocess_setop_tlist(List *new_tlist, List *orig_tlist); + static bool clause_in_sort_list(SortClause *scl, List *sortList); + static bool choose_hashed_distinct(PlannerInfo *root, + Plan *lefttree, + double tuple_fraction, + double dNumGroups, + List *distinct_pathkeys, + List *sort_pathkeys, + List *current_pathkeys); /***************************************************************************** *************** *** 635,647 **** Plan *result_plan; List *current_pathkeys; List *sort_pathkeys; double dNumGroups = 0; /* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */ if (parse->limitCount || parse->limitOffset) tuple_fraction = preprocess_limit(root, tuple_fraction, &offset_est, &count_est); ! if (parse->setOperations) { List *set_sortclauses; --- 643,660 ---- Plan *result_plan; List *current_pathkeys; List *sort_pathkeys; + List *distinct_pathkeys = NIL; double dNumGroups = 0; + long numGroups = 0; + bool use_hashed_distinct; /* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */ if (parse->limitCount || parse->limitOffset) tuple_fraction = preprocess_limit(root, tuple_fraction, &offset_est, &count_est); ! if (parse->distinctClause) ! distinct_pathkeys = make_pathkeys_for_sortclauses(parse->distinctClause, tlist); ! 
if (parse->setOperations) { List *set_sortclauses; *************** *** 710,716 **** Path *cheapest_path; Path *sorted_path; Path *best_path; - long numGroups = 0; AggClauseCounts agg_counts; int numGroupCols = list_length(parse->groupClause); bool use_hashed_grouping = false; --- 723,728 ---- *************** *** 763,773 **** --- 775,792 ---- * BY is a superset of GROUP BY, it would be tempting to request sort * by ORDER BY --- but that might just leave us failing to exploit an * available sort order at all. Needs more thought...) + * + * Addition: Since the distinct list is no longer added to the sort list + * at parse time, requesting a sorted order by the distinct_pathkeys + * allows for the possibility of a sorted order to be exploited by the + * Unique filter. */ if (parse->groupClause) root->query_pathkeys = root->group_pathkeys; else if (parse->sortClause) root->query_pathkeys = root->sort_pathkeys; + else if (parse->distinctClause) + root->query_pathkeys = distinct_pathkeys; else root->query_pathkeys = NIL; *************** *** 1007,1018 **** } /* end of non-minmax-aggregate case */ } /* end of if (setOperations) */ ! /* ! * If we were not able to make the plan come out in the right order, add ! * an explicit sort step. ! */ if (parse->sortClause) { if (!pathkeys_contained_in(sort_pathkeys, current_pathkeys)) { result_plan = (Plan *) --- 1026,1089 ---- } /* end of non-minmax-aggregate case */ } /* end of if (setOperations) */ ! /* ! * If there is a DISTINCT clause, decide which filter to ! * use and make adjustments to the sort list accordingly. ! * ! * Note: Although it seems like a better idea to sort after ! * the DISTINCT filter when using the hash-based algorithm, ! * it would effectively break DISTINCT ON functionality. ! * Thus, unless we separate DISTINCT from DISTINCT ON (or ! * completely redesign the clauses), it is a necessary evil ! * to always execute an ORDER BY clause before the DISTINCT ! * clause. ! */ ! if (parse->distinctClause) ! { ! 
distinct_pathkeys = canonicalize_pathkeys(root, distinct_pathkeys); ! use_hashed_distinct = choose_hashed_distinct(root, result_plan, tuple_fraction, ! dNumGroups, distinct_pathkeys, ! sort_pathkeys, current_pathkeys); ! ! if (!use_hashed_distinct) ! { ! /* ! * The sort list only needs to be adjusted for the Unique filter ! * if there is an ORDER BY clause and DISTINCT is not a subset of it. ! * ! * If there is no ORDER BY clause, then the distinct list can simply ! * be substituted for the sort list. ! */ ! if (sort_pathkeys) ! { ! if(!pathkeys_contained_in(distinct_pathkeys, sort_pathkeys)) ! { ! ListCell *dlitem; ! ! foreach (dlitem, parse->distinctClause) ! { ! SortClause *scl = (SortClause *) lfirst(dlitem); ! if (!clause_in_sort_list(scl, parse->sortClause)) ! parse->sortClause = lappend(parse->sortClause, copyObject(scl)); ! } ! ! /* force a sort */ ! current_pathkeys = NIL; ! } ! } ! else ! { ! parse->sortClause = parse->distinctClause; ! sort_pathkeys = distinct_pathkeys; ! } ! } ! } ! if (parse->sortClause) { + /* If we were not able to make the plan come out in the right order, add + * an explicit sort step. + */ if (!pathkeys_contained_in(sort_pathkeys, current_pathkeys)) { result_plan = (Plan *) *************** *** 1022,1035 **** current_pathkeys = sort_pathkeys; } } ! /* ! * If there is a DISTINCT clause, add the UNIQUE node. */ if (parse->distinctClause) ! { ! result_plan = (Plan *) make_unique(result_plan, parse->distinctClause); /* * If there was grouping or aggregation, leave plan_rows as-is (ie, * assume the result was already mostly unique). If not, use the --- 1093,1116 ---- current_pathkeys = sort_pathkeys; } } ! /* ! * If there is a DISTINCT clause, add the filtering node. */ if (parse->distinctClause) ! { ! if (use_hashed_distinct) { ! /* convert # groups to long int */ ! numGroups = (long) Min(dNumGroups, (double) LONG_MAX); ! ! result_plan = (Plan *) make_hash_distinct(result_plan, ! parse->distinctClause, ! numGroups); ! } else { ! 
result_plan = (Plan *) make_unique(result_plan, parse->distinctClause); ! } + /* * If there was grouping or aggregation, leave plan_rows as-is (ie, * assume the result was already mostly unique). If not, use the *************** *** 1037,1043 **** */ if (!parse->groupClause && !root->hasHavingQual && !parse->hasAggs) result_plan->plan_rows = dNumGroups; ! } /* * Finally, if there is a LIMIT/OFFSET clause, add the LIMIT node. --- 1118,1124 ---- */ if (!parse->groupClause && !root->hasHavingQual && !parse->hasAggs) result_plan->plan_rows = dNumGroups; ! } /* * Finally, if there is a LIMIT/OFFSET clause, add the LIMIT node. *************** *** 1618,1620 **** --- 1699,1761 ---- elog(ERROR, "resjunk output columns are not implemented"); return new_tlist; } + + static bool clause_in_sort_list(SortClause *scl, List *sortList) { + ListCell *slitem; + + foreach (slitem, sortList) { + SortClause *current = (SortClause *) lfirst(slitem); + + if (scl->tleSortGroupRef == current->tleSortGroupRef) + return true; + } + + return false; + } + + static bool choose_hashed_distinct(PlannerInfo *root, Plan *lefttree, + double tuple_fraction, double dNumGroups, + List *distinct_pathkeys, List *sort_pathkeys, + List *current_pathkeys) { + + Path unique_path; + Path hashed_path; + /* + * If the best_path is already ordered for the Unique filter, or + * there is an explicit sort requested on the distinct clauses + * the Unique filter will be better. + */ + if (pathkeys_contained_in(distinct_pathkeys, sort_pathkeys) || + pathkeys_contained_in(distinct_pathkeys, current_pathkeys)) + { + return false; + } + + /* + * If it doesn't look like the hash table will fit in memory, don't + * hash. + */ + if (lefttree->plan_width * dNumGroups > work_mem * 1024L) + return false; + + /* + * Since cost_sort currently doesn't care how many keys we're sorting + * by, if there is an explicit sort, we might as well use the Unique + * filter (it uses less memory). 
+ * + * If this were not the case, we would have calculated the cost of + * sorting (if needed) sort_pathkeys against the cost of sorting + * list_union(sort_pathkeys, distinct_pathkeys). + */ + if (sort_pathkeys) { + return false; + } + + /* + * Assuming that if the hash table will fit in memory and we're not + * explicitly sorting in any way, hashing would be better. + * + * Note: Not sure if this is covering all the possibilities. + */ + return true; + } diff -cr --new-file pgsql/src/backend/optimizer/plan/setrefs.c pgsql-hashdistinct/src/backend/optimizer/plan/setrefs.c *** pgsql/src/backend/optimizer/plan/setrefs.c 2006-03-05 10:58:30.000000000 -0500 --- pgsql-hashdistinct/src/backend/optimizer/plan/setrefs.c 2006-08-02 12:50:55.000000000 -0400 *************** *** 213,218 **** --- 213,219 ---- case T_Material: case T_Sort: case T_Unique: + case T_HashDistinct: case T_SetOp: /* *************** *** 546,551 **** --- 547,553 ---- case T_Material: case T_Sort: case T_Unique: + case T_HashDistinct: case T_SetOp: /* diff -cr --new-file pgsql/src/backend/optimizer/plan/subselect.c pgsql-hashdistinct/src/backend/optimizer/plan/subselect.c *** pgsql/src/backend/optimizer/plan/subselect.c 2006-05-02 20:24:56.000000000 -0400 --- pgsql-hashdistinct/src/backend/optimizer/plan/subselect.c 2006-08-02 12:33:00.000000000 -0400 *************** *** 1169,1174 **** --- 1169,1175 ---- case T_Material: case T_Sort: case T_Unique: + case T_HashDistinct: case T_SetOp: case T_Group: break; diff -cr --new-file pgsql/src/backend/parser/parse_clause.c pgsql-hashdistinct/src/backend/parser/parse_clause.c *** pgsql/src/backend/parser/parse_clause.c 2006-03-15 19:31:55.000000000 -0500 --- pgsql-hashdistinct/src/backend/parser/parse_clause.c 2006-08-15 13:07:24.000000000 -0400 *************** *** 1479,1496 **** { /* We had SELECT DISTINCT */ ! /* ! * All non-resjunk elements from target list that are not already in ! * the sort list should be added to it. (We don't really care what ! 
* order the DISTINCT fields are checked in, so we can leave the ! * user's ORDER BY spec alone, and just add additional sort keys to it ! * to ensure that all targetlist items get sorted.) ! */ ! *sortClause = addAllTargetsToSortList(pstate, ! *sortClause, ! *targetlist, ! true); ! /* * Now, DISTINCT list consists of all non-resjunk sortlist items. * Actually, all the sortlist items had better be non-resjunk! --- 1479,1496 ---- { /* We had SELECT DISTINCT */ ! // /* ! // * All non-resjunk elements from target list that are not already in ! // * the sort list should be added to it. (We don't really care what ! // * order the DISTINCT fields are checked in, so we can leave the ! // * user's ORDER BY spec alone, and just add additional sort keys to it ! // * to ensure that all targetlist items get sorted.) ! // */ ! // *sortClause = addAllTargetsToSortList(pstate, ! // *sortClause, ! // *targetlist, ! // true); ! // /* * Now, DISTINCT list consists of all non-resjunk sortlist items. * Actually, all the sortlist items had better be non-resjunk! *************** *** 1507,1515 **** ereport(ERROR, (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), errmsg("for SELECT DISTINCT, ORDER BY expressions must appear in select list"))); ! else ! result = lappend(result, copyObject(scl)); } } else { --- 1507,1516 ---- ereport(ERROR, (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), errmsg("for SELECT DISTINCT, ORDER BY expressions must appear in select list"))); ! // else ! // result = lappend(result, copyObject(scl)); } + result = addAllTargetsToSortList(pstate, result, *targetlist, true); } else { *************** *** 1552,1577 **** } else { ! *sortClause = addTargetToSortList(pstate, tle, ! *sortClause, *targetlist, SORTBY_ASC, NIL, true); ! /* ! * Probably, the tle should always have been added at the end ! * of the sort list ... but search to be safe. ! */ ! foreach(slitem, *sortClause) ! { ! SortClause *scl = (SortClause *) lfirst(slitem); ! ! 
if (tle->ressortgroupref == scl->tleSortGroupRef) ! { ! result = lappend(result, copyObject(scl)); ! break; ! } ! } ! if (slitem == NULL) /* should not happen */ ! elog(ERROR, "failed to add DISTINCT ON clause to target list"); } } } --- 1553,1581 ---- } else { ! // *sortClause = addTargetToSortList(pstate, tle, ! // *sortClause, *targetlist, ! // SORTBY_ASC, NIL, true); ! result = addTargetToSortList(pstate, tle, ! result, *targetlist, SORTBY_ASC, NIL, true); ! // /* ! // * Probably, the tle should always have been added at the end ! // * of the sort list ... but search to be safe. ! // */ ! // foreach(slitem, *sortClause) ! // { ! // SortClause *scl = (SortClause *) lfirst(slitem); ! // ! // if (tle->ressortgroupref == scl->tleSortGroupRef) ! // { ! // result = lappend(result, copyObject(scl)); ! // break; ! // } ! // } ! // if (slitem == NULL) /* should not happen */ ! // elog(ERROR, "failed to add DISTINCT ON clause to target list"); } } } diff -cr --new-file pgsql/src/.cdtproject pgsql-hashdistinct/src/.cdtproject *** pgsql/src/.cdtproject 1969-12-31 19:00:00.000000000 -0500 --- pgsql-hashdistinct/src/.cdtproject 2006-06-12 11:23:46.000000000 -0400 *************** *** 0 **** --- 1,56 ---- + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff -cr --new-file pgsql/src/include/executor/nodeHashDistinct.h pgsql-hashdistinct/src/include/executor/nodeHashDistinct.h *** pgsql/src/include/executor/nodeHashDistinct.h 1969-12-31 19:00:00.000000000 -0500 --- pgsql-hashdistinct/src/include/executor/nodeHashDistinct.h 2006-06-13 21:52:31.000000000 -0400 *************** *** 0 **** --- 1,25 ---- + /*------------------------------------------------------------------------- + * + * nodeHashDistinct.h + * + * + * + * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * $PostgreSQL: 
pgsql/src/include/executor/nodeUnique.h,v 1.22 2006/03/05 15:58:56 momjian Exp $ + * + *------------------------------------------------------------------------- + */ + #ifndef NODEHASHDISTINCT_H + #define NODEHASHDISTINCT_H + + #include "nodes/execnodes.h" + + extern int ExecCountSlotsHashDistinct(HashDistinct *node); + extern HashDistinctState *ExecInitHashDistinct(HashDistinct *node, EState *estate, int eflags); + extern TupleTableSlot *ExecHashDistinct(HashDistinctState *node); + extern void ExecEndHashDistinct(HashDistinctState *node); + extern void ExecReScanHashDistinct(HashDistinctState *node, ExprContext *exprCtxt); + + #endif /* NODEHASHDISTINCT_H */ diff -cr --new-file pgsql/src/include/nodes/execnodes.h pgsql-hashdistinct/src/include/nodes/execnodes.h *** pgsql/src/include/nodes/execnodes.h 2006-04-30 14:30:40.000000000 -0400 --- pgsql-hashdistinct/src/include/nodes/execnodes.h 2006-06-22 14:22:03.000000000 -0400 *************** *** 1268,1273 **** --- 1268,1288 ---- } UniqueState; /* ---------------- + * HashDistinctState information + * ---------------- + */ + typedef struct HashDistinctState + { + PlanState ps; /* its first field is NodeTag */ + FmgrInfo *eqfunctions; /* per-field lookup data for equality fns */ + FmgrInfo *hashfunctions; /* per-field lookup data for hash functions */ + MemoryContext tempContext; /* short-term context for comparisons */ + MemoryContext tableContext; /* long-term context for the hash table */ + TupleHashTable hashtable; /* hash table with one entry per distinct tuple */ + } HashDistinctState; + + + /* ---------------- + * HashState information + * ---------------- + */ diff -cr --new-file pgsql/src/include/nodes/nodes.h pgsql-hashdistinct/src/include/nodes/nodes.h *** pgsql/src/include/nodes/nodes.h 2006-04-30 14:30:40.000000000 -0400 --- pgsql-hashdistinct/src/include/nodes/nodes.h 2006-06-22 14:55:06.000000000 -0400 *************** *** 63,68 **** --- 63,69 ---- T_Group, T_Agg, T_Unique, + T_HashDistinct, T_Hash, T_SetOp, 
T_Limit, *************** *** 94,99 **** --- 95,101 ---- T_GroupState, T_AggState, T_UniqueState, + T_HashDistinctState, T_HashState, T_SetOpState, T_LimitState, diff -cr --new-file pgsql/src/include/nodes/plannodes.h pgsql-hashdistinct/src/include/nodes/plannodes.h *** pgsql/src/include/nodes/plannodes.h 2006-03-05 10:58:57.000000000 -0500 --- pgsql-hashdistinct/src/include/nodes/plannodes.h 2006-06-22 14:24:21.000000000 -0400 *************** *** 430,435 **** --- 430,447 ---- } Unique; /* ---------------- + * hashdistinct node + * ---------------- + */ + typedef struct HashDistinct + { + Plan plan; + int numCols; /* number of columns to check for uniqueness */ + int numDistinct; /* estimated number of distinct tuples */ + AttrNumber *uniqColIdx; /* indexes into the target list */ + } HashDistinct; + + /* ---------------- * hash build node * ---------------- */ diff -cr --new-file pgsql/src/include/optimizer/planmain.h pgsql-hashdistinct/src/include/optimizer/planmain.h *** pgsql/src/include/optimizer/planmain.h 2006-03-05 10:58:57.000000000 -0500 --- pgsql-hashdistinct/src/include/optimizer/planmain.h 2006-08-18 16:52:20.000000000 -0400 *************** *** 55,60 **** --- 55,61 ---- extern Material *make_material(Plan *lefttree); extern Plan *materialize_finished_plan(Plan *subplan); extern Unique *make_unique(Plan *lefttree, List *distinctList); + extern HashDistinct *make_hash_distinct(Plan *lefttree, List *distinctList, long numDistinct); extern Limit *make_limit(Plan *lefttree, Node *limitOffset, Node *limitCount, int offset_est, int count_est); extern SetOp *make_setop(SetOpCmd cmd, Plan *lefttree, diff -cr --new-file pgsql/src/interfaces/libpq/libpq.rc pgsql-hashdistinct/src/interfaces/libpq/libpq.rc *** pgsql/src/interfaces/libpq/libpq.rc 2006-06-04 05:32:13.000000000 -0400 --- pgsql-hashdistinct/src/interfaces/libpq/libpq.rc 2006-06-22 15:01:34.000000000 -0400 *************** *** 1,8 **** #include VS_VERSION_INFO VERSIONINFO ! FILEVERSION 8,2,0,6155 ! 
PRODUCTVERSION 8,2,0,6155 FILEFLAGSMASK 0x3fL FILEFLAGS 0 FILEOS VOS__WINDOWS32 --- 1,8 ---- #include VS_VERSION_INFO VERSIONINFO ! FILEVERSION 8,2,0,6173 ! PRODUCTVERSION 8,2,0,6173 FILEFLAGSMASK 0x3fL FILEFLAGS 0 FILEOS VOS__WINDOWS32 diff -cr --new-file pgsql/src/.project pgsql-hashdistinct/src/.project *** pgsql/src/.project 1969-12-31 19:00:00.000000000 -0500 --- pgsql-hashdistinct/src/.project 2006-06-12 11:23:45.000000000 -0400 *************** *** 0 **** --- 1,84 ---- + + + PostgreSQL + + + + + + org.eclipse.cdt.make.core.makeBuilder + + + org.eclipse.cdt.make.core.append_environment + true + + + org.eclipse.cdt.make.core.enableCleanBuild + true + + + org.eclipse.cdt.make.core.build.command + gmake + + + org.eclipse.cdt.make.core.useDefaultBuildCmd + false + + + org.eclipse.cdt.make.core.build.target.auto + all + + + org.eclipse.cdt.make.core.stopOnError + false + + + org.eclipse.cdt.make.core.build.target.full + clean all + + + org.eclipse.cdt.make.core.build.target.inc + all + + + org.eclipse.cdt.make.core.build.arguments + + + + org.eclipse.cdt.core.errorOutputParser + org.eclipse.cdt.core.MakeErrorParser;org.eclipse.cdt.core.GCCErrorParser;org.eclipse.cdt.core.GASErrorParser;org.eclipse.cdt.core.GLDErrorParser;org.eclipse.cdt.core.VCErrorParser; + + + org.eclipse.cdt.make.core.enableAutoBuild + true + + + org.eclipse.cdt.make.core.environment + + + + org.eclipse.cdt.make.core.enabledIncrementalBuild + true + + + org.eclipse.cdt.make.core.build.target.clean + clean + + + org.eclipse.cdt.make.core.enableFullBuild + true + + + + + org.eclipse.cdt.make.core.ScannerConfigBuilder + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.make.core.makeNature + org.eclipse.cdt.make.core.ScannerConfigNature + +