Skip to content

Commit

Permalink
Add GUC 'gp_random_insert_segments' to control the segments used for …
Browse files Browse the repository at this point in the history
…random distributed table insertion

Introduces the 'gp_random_insert_segments' GUC to avoid generating an excessive
number of small, fragmented files when a small amount of data is inserted into a
cluster with a large number of segments (e.g., 1000 records into 100 segments).

Fragmented data insertion can significantly degrade performance, especially
when using append-optimized or cloud-based storage. With
the 'gp_random_insert_segments' GUC, users can limit the number of segments
used for data insertion into randomly distributed tables, which significantly
reduces the number of fragmented files.
  • Loading branch information
foreyes committed Apr 17, 2024
1 parent 7c0423c commit 690b060
Show file tree
Hide file tree
Showing 7 changed files with 49 additions and 0 deletions.
14 changes: 14 additions & 0 deletions src/backend/cdb/cdbllize.c
Original file line number Diff line number Diff line change
Expand Up @@ -1332,6 +1332,20 @@ build_slice_table_walker(Node *node, build_slice_table_context *context)
sendSlice->directDispatch.contentIds = list_make1_int(0);
}

/*
 * Limited-segment INSERT: applies when the statement is an INSERT and this
 * Motion is a hash motion with a Strewn locus whose numHashSegments equals
 * gp_random_insert_segments.
 *
 * NOTE(review): unlike the other gp_random_insert_segments checks, this one
 * does not test that the GUC is > 0; presumably numHashSegments is never 0
 * here -- confirm.
 */
if (root->parse->commandType == CMD_INSERT &&
motion->motionType == MOTIONTYPE_HASH &&
motion->plan.locustype == CdbLocusType_Strewn &&
motion->numHashSegments == gp_random_insert_segments)
{
PlanSlice *recvSlice;
/*
 * Insertion into a randomly distributed table is restricted to a limited
 * number of segments, so only that many segments of the receiving slice
 * are enabled to do the actual work.
 *
 * The receiving slice is looked up as the parent of the sending slice,
 * and its segment count is set to the Motion's numHashSegments.
 */
recvSlice = (PlanSlice *) list_nth(context->slices, sendSlice->parentIndex);
recvSlice->numsegments = motion->numHashSegments;
}

result = plan_tree_walker((Node *) motion,
build_slice_table_walker,
context,
Expand Down
9 changes: 9 additions & 0 deletions src/backend/cdb/cdbpath.c
Original file line number Diff line number Diff line change
Expand Up @@ -2611,6 +2611,15 @@ create_motion_path_for_insert(PlannerInfo *root, GpPolicy *policy,
}
else
elog(ERROR, "unrecognized policy type %u", policyType);

/*
 * Limit the segments targeted by an insert into a randomly distributed
 * table. The restriction applies only when all of the following hold:
 *   - the subpath's locus is Strewn and has no distribution keys
 *     (presumably the signature of a random distribution -- confirm);
 *   - gp_random_insert_segments is set to a positive value;
 *   - that value is smaller than the locus's current segment count.
 * The locus of the subpath about to be returned is modified in place.
 */
if (CdbPathLocus_IsStrewn(subpath->locus) && subpath->locus.distkey == NIL &&
gp_random_insert_segments > 0 &&
gp_random_insert_segments < CdbPathLocus_NumSegments(subpath->locus))
{
/* Select limited random segments for data insertion. */
subpath->locus.numsegments = gp_random_insert_segments;
}

return subpath;
}

Expand Down
6 changes: 6 additions & 0 deletions src/backend/commands/copyfrom.c
Original file line number Diff line number Diff line change
Expand Up @@ -3324,6 +3324,12 @@ GetTargetSeg(GpDistributionData *distData, TupleTableSlot *slot)

target_seg = cdbhashreduce(cdbHash); /* hash result segment */
}
else if (gp_random_insert_segments > 0 &&
gp_random_insert_segments < policy->numsegments)
{
/*
 * Select limited random segments for data insertion.
 *
 * Taken only when the GUC is positive and smaller than the policy's
 * segment count; the target segment then comes from cdbhashrandomseg()
 * called with the GUC value instead of the policy's segment count.
 * NOTE(review): presumably this returns a random segment id in
 * [0, gp_random_insert_segments) -- confirm against cdbhashrandomseg().
 */
target_seg = cdbhashrandomseg(gp_random_insert_segments);
}
else
{
/*
Expand Down
7 changes: 7 additions & 0 deletions src/backend/gpopt/translate/CTranslatorQueryToDXL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ extern "C" {
#include "nodes/parsenodes.h"
#include "nodes/plannodes.h"
#include "optimizer/walkers.h"
#include "utils/guc.h"
#include "utils/rel.h"
}

Expand Down Expand Up @@ -736,6 +737,12 @@ CTranslatorQueryToDXL::TranslateInsertQueryToDXL()
GPOS_WSZ_LIT("DML not enabled"));
}

// Limiting the insert segments via gp_random_insert_segments is not
// implemented in this translator: whenever the GUC is positive, reject the
// INSERT as an unsupported feature.
// NOTE(review): this fires for every INSERT while the GUC is set, not only
// for randomly distributed targets; presumably the caller then falls back
// to the Postgres planner -- confirm.
if (gp_random_insert_segments > 0)
{
GPOS_RAISE(gpdxl::ExmaDXL, gpdxl::ExmiQuery2DXLUnsupportedFeature,
GPOS_WSZ_LIT("limited insert segments not supported"));
}

CDXLNode *query_dxlnode = TranslateSelectQueryToDXL();
const RangeTblEntry *rte = (RangeTblEntry *) gpdb::ListNth(
m_query->rtable, m_query->resultRelation - 1);
Expand Down
11 changes: 11 additions & 0 deletions src/backend/utils/misc/guc_gp.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ int gp_appendonly_compaction_threshold = 0;
bool enable_parallel = false;
int gp_appendonly_insert_files = 0;
int gp_appendonly_insert_files_tuples_range = 0;
int gp_random_insert_segments = 0;
bool gp_heap_require_relhasoids_match = true;
bool gp_local_distributed_cache_stats = false;
bool debug_xlog_record_read = false;
Expand Down Expand Up @@ -3215,6 +3216,16 @@ struct config_int ConfigureNamesInt_gp[] =
NULL, NULL, NULL
},

{
/*
 * gp_random_insert_segments: user-settable (PGC_USERSET) integer GUC in
 * the CUSTOM_OPTIONS group; no long description is provided.
 */
{"gp_random_insert_segments", PGC_USERSET, CUSTOM_OPTIONS,
gettext_noop("Use limited number of segments for random distributed table insertion."),
NULL
},
&gp_random_insert_segments,
/*
 * boot value, minimum, maximum. The default of 0 leaves the limit off:
 * the checks that consume this GUC only act when it is > 0.
 */
0, 0, INT_MAX,
/* no check, assign or show hooks */
NULL, NULL, NULL
},

{
{"gp_workfile_max_entries", PGC_POSTMASTER, RESOURCES,
gettext_noop("Sets the maximum number of entries that can be stored in the workfile directory"),
Expand Down
1 change: 1 addition & 0 deletions src/include/utils/guc.h
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ extern bool gp_appendonly_compaction;
extern bool enable_parallel;
extern int gp_appendonly_insert_files;
extern int gp_appendonly_insert_files_tuples_range;
extern int gp_random_insert_segments;
extern bool enable_answer_query_using_materialized_views;
extern bool enable_offload_entry_to_qe;
/*
Expand Down
1 change: 1 addition & 0 deletions src/include/utils/sync_guc_name.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,4 @@
"gp_resgroup_debug_wait_queue",
"gp_appendonly_insert_files",
"gp_appendonly_insert_files_tuples_range",
"gp_random_insert_segments",

0 comments on commit 690b060

Please sign in to comment.