Skip to content

Commit

Permalink
Add GUC 'gp_random_insert_segments' to control the segments used for …
Browse files Browse the repository at this point in the history
…random distributed table insertion

Introduces the 'gp_random_insert_segments' GUC to reduce the generation of
excessive fragmented files when small amounts of data are inserted into
clusters with many segments (e.g., inserting 1000 records into a 100-segment cluster).

Fragmented data insertion can significantly degrade performance, especially
when using append-optimized or cloud-based storage. With this GUC, users can
limit the number of segments used for inserting into randomly distributed
tables, which greatly reduces the number of fragmented files produced.
  • Loading branch information
foreyes committed Apr 11, 2024
1 parent be6897c commit 50efffc
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 0 deletions.
9 changes: 9 additions & 0 deletions src/backend/cdb/cdbpath.c
Original file line number Diff line number Diff line change
Expand Up @@ -2611,6 +2611,15 @@ create_motion_path_for_insert(PlannerInfo *root, GpPolicy *policy,
}
else
elog(ERROR, "unrecognized policy type %u", policyType);

if (CdbPathLocus_IsStrewn(subpath->locus) && subpath->locus.distkey == NIL &&
gp_random_insert_segments > 0 &&
gp_random_insert_segments < CdbPathLocus_NumSegments(subpath->locus))
{
/* Select limited random segments for data insertion. */
subpath->locus.numsegments = gp_random_insert_segments;
}

return subpath;
}

Expand Down
6 changes: 6 additions & 0 deletions src/backend/commands/copyfrom.c
Original file line number Diff line number Diff line change
Expand Up @@ -3324,6 +3324,12 @@ GetTargetSeg(GpDistributionData *distData, TupleTableSlot *slot)

target_seg = cdbhashreduce(cdbHash); /* hash result segment */
}
else if (gp_random_insert_segments > 0 &&
gp_random_insert_segments < policy->numsegments)
{
/* Select limited random segments for data insertion. */
target_seg = cdbhashrandomseg(gp_random_insert_segments);
}
else
{
/*
Expand Down
11 changes: 11 additions & 0 deletions src/backend/utils/misc/guc_gp.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ int gp_appendonly_compaction_threshold = 0;
bool enable_parallel = false;
int gp_appendonly_insert_files = 0;
int gp_appendonly_insert_files_tuples_range = 0;
int gp_random_insert_segments = 0;
bool gp_heap_require_relhasoids_match = true;
bool gp_local_distributed_cache_stats = false;
bool debug_xlog_record_read = false;
Expand Down Expand Up @@ -3215,6 +3216,16 @@ struct config_int ConfigureNamesInt_gp[] =
NULL, NULL, NULL
},

{
{"gp_random_insert_segments", PGC_USERSET, CUSTOM_OPTIONS,
gettext_noop("Use limited number of segments for random distributed table insertion."),
NULL
},
&gp_random_insert_segments,
0, 0, INT_MAX,
NULL, NULL, NULL
},

{
{"gp_workfile_max_entries", PGC_POSTMASTER, RESOURCES,
gettext_noop("Sets the maximum number of entries that can be stored in the workfile directory"),
Expand Down
1 change: 1 addition & 0 deletions src/include/utils/guc.h
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,7 @@ extern bool gp_appendonly_compaction;
extern bool enable_parallel;
extern int gp_appendonly_insert_files;
extern int gp_appendonly_insert_files_tuples_range;
extern int gp_random_insert_segments;
extern bool enable_answer_query_using_materialized_views;
extern bool enable_offload_entry_to_qe;
/*
Expand Down
1 change: 1 addition & 0 deletions src/include/utils/sync_guc_name.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,4 @@
"gp_resgroup_debug_wait_queue",
"gp_appendonly_insert_files",
"gp_appendonly_insert_files_tuples_range",
"gp_random_insert_segments",

0 comments on commit 50efffc

Please sign in to comment.