From 693ec2ec502979356fb81e4328110b5539dc3796 Mon Sep 17 00:00:00 2001 From: Liang Mao Date: Wed, 15 May 2024 16:44:04 +0800 Subject: [PATCH] [GC] Add option G1BarrierSimple to use simple g1 post barrier Summary: Provide option G1BarrierSimple to use simple G1 post barrier for better mutator performance Testing: CI pipeline Reviewers: yude, yifeng Issue: https://github.com/dragonwell-project/dragonwell21/issues/56 --- .../gc/g1/g1BarrierSetAssembler_aarch64.cpp | 40 ++++++++++++++++ .../x86/gc/g1/g1BarrierSetAssembler_x86.cpp | 47 +++++++++++++++++++ src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp | 11 +++++ src/hotspot/share/gc/g1/g1Arguments.cpp | 19 ++++++++ src/hotspot/share/gc/g1/g1BarrierSet.cpp | 4 ++ .../share/gc/g1/g1BarrierSet.inline.hpp | 4 ++ src/hotspot/share/gc/g1/g1CardTable.cpp | 10 +++- .../share/gc/g1/g1RedirtyCardsQueue.cpp | 11 +++++ .../share/gc/g1/g1RedirtyCardsQueue.hpp | 2 + src/hotspot/share/gc/g1/g1RemSet.cpp | 39 +++++++++++++-- .../gc/g1/g1YoungGCPostEvacuateTasks.cpp | 6 ++- src/hotspot/share/gc/g1/g1_globals.hpp | 3 ++ .../TestG1ConcRefinementThreads.java | 1 + 13 files changed, 190 insertions(+), 7 deletions(-) diff --git a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp index 42081d422c8..f388fa012c1 100644 --- a/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/gc/g1/g1BarrierSetAssembler_aarch64.cpp @@ -86,6 +86,29 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, Register start, Register count, Register scratch, RegSet saved_regs) { + if (G1BarrierSimple) { + Label L_loop, L_done; + const Register end = count; + + __ cbz(count, L_done); // zero count - nothing to do + + __ lea(end, Address(start, count, Address::lsl(LogBytesPerHeapOop))); // end = start + count << 
LogBytesPerHeapOop + __ sub(end, end, BytesPerHeapOop); // last element address to make inclusive + __ lsr(start, start, CardTable::card_shift()); + __ lsr(end, end, CardTable::card_shift()); + __ sub(count, end, start); // offset of the last card from the first (number of cards - 1) + + __ load_byte_map_base(scratch); + __ add(start, start, scratch); + __ bind(L_loop); + assert((int)CardTable::dirty_card_val() == 0, "must be 0"); + __ strb(zr, Address(start, count)); + __ subs(count, count, 1); + __ br(Assembler::GE, L_loop); + __ bind(L_done); + return; + } + __ push(saved_regs, sp); assert_different_registers(start, count, scratch); assert_different_registers(c_rarg0, count); @@ -204,6 +227,16 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, Label done; Label runtime; + if (G1BarrierSimple) { + const Register card_addr = tmp1; + __ lsr(card_addr, store_addr, CardTable::card_shift()); + __ load_byte_map_base(tmp2); + __ add(card_addr, card_addr, tmp2); + __ strb(zr, Address(card_addr)); + __ bind(done); + return; + } + // Does store cross heap regions? __ eor(tmp1, store_addr, new_val); @@ -444,6 +477,13 @@ void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* __ load_parameter(0, card_offset); __ lsr(card_offset, card_offset, CardTable::card_shift()); __ load_byte_map_base(byte_map_base); + + if (G1BarrierSimple) { + __ strb(zr, Address(byte_map_base, card_offset)); + __ bind(done); + __ epilogue(); + return; + } __ ldrb(rscratch1, Address(byte_map_base, card_offset)); __ cmpw(rscratch1, (int)G1CardTable::g1_young_card_val()); __ br(Assembler::EQ, done); diff --git a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp index f609846f00d..c906cf437ba 100644 --- a/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/gc/g1/g1BarrierSetAssembler_x86.cpp @@ -42,6 +42,8 @@ #define __ masm-> +#define TIMES_OOP (UseCompressedOops ? 
Address::times_4 : Address::times_8) + void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators, Register addr, Register count) { bool dest_uninitialized = (decorators & IS_DEST_UNINITIALIZED) != 0; @@ -98,6 +100,36 @@ void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, Register addr, Register count, Register tmp) { +#ifdef _LP64 + if (G1BarrierSimple) { + CardTableBarrierSet* ct = + barrier_set_cast<CardTableBarrierSet>(BarrierSet::barrier_set()); + intptr_t disp = (intptr_t) ct->card_table()->byte_map_base(); + + Label L_loop, L_done; + const Register end = count; + assert_different_registers(addr, end); + + __ testl(count, count); + __ jcc(Assembler::zero, L_done); // zero count - nothing to do + + __ leaq(end, Address(addr, count, TIMES_OOP, 0)); // end == addr+count*oop_size + __ subptr(end, BytesPerHeapOop); // end - 1 to make inclusive + __ shrptr(addr, CardTable::card_shift()); + __ shrptr(end, CardTable::card_shift()); + __ subptr(end, addr); // end --> cards count + + __ mov64(tmp, disp); + __ addptr(addr, tmp); + __ bind(L_loop); + __ movb(Address(addr, count, Address::times_1), G1CardTable::dirty_card_val()); + __ decrement(count); + __ jcc(Assembler::greaterEqual, L_loop); + __ bind(L_done); + return; + } +#endif + __ push_call_clobbered_registers(false /* save_fpu */); #ifdef _LP64 if (c_rarg0 == count) { // On win64 c_rarg0 == rcx @@ -308,6 +340,12 @@ void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, __ movptr(cardtable, (intptr_t)ct->card_table()->byte_map_base()); __ addptr(card_addr, cardtable); + if (G1BarrierSimple) { + __ movb(Address(card_addr, 0), G1CardTable::dirty_card_val()); + __ bind(done); + return; + } + __ cmpb(Address(card_addr, 0), G1CardTable::g1_young_card_val()); __ jcc(Assembler::equal, done); @@ -542,6 +580,15 @@ void 
G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* NOT_LP64(__ get_thread(thread);) + if (G1BarrierSimple) { + __ movb(Address(card_addr, 0), G1CardTable::dirty_card_val()); + __ bind(done); + __ pop(rcx); + __ pop(rax); + __ epilogue(); + return; + } + __ cmpb(Address(card_addr, 0), G1CardTable::g1_young_card_val()); __ jcc(Assembler::equal, done); diff --git a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp index 0c7f845e671..5fcc68812f7 100644 --- a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp +++ b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp @@ -446,6 +446,13 @@ void G1BarrierSetC2::post_barrier(GraphKit* kit, // Combine card table base and card offset Node* card_adr = __ AddP(no_base, byte_map_base_node(kit), card_offset ); + if (G1BarrierSimple) { + __ store(__ ctrl(), card_adr, dirty_card, T_BYTE, Compile::AliasIdxRaw, MemNode::unordered); + // Final sync IdealKit and GraphKit. + kit->final_sync(ideal); + return; + } + // If we know the value being stored does it cross regions? 
if (val != nullptr) { @@ -715,6 +722,10 @@ void G1BarrierSetC2::eliminate_gc_barrier(PhaseMacroExpand* macro, Node* node) c macro->replace_node(node, macro->zerocon(node->as_Load()->bottom_type()->basic_type())); } else { assert(node->Opcode() == Op_CastP2X, "ConvP2XNode required"); + if (G1BarrierSimple) { + CardTableBarrierSetC2::eliminate_gc_barrier(macro, node); + return; + } assert(node->outcnt() <= 2, "expects 1 or 2 users: Xor and URShift nodes"); // It could be only one user, URShift node, in Object.clone() intrinsic // but the new allocation is passed to arraycopy stub and it could not diff --git a/src/hotspot/share/gc/g1/g1Arguments.cpp b/src/hotspot/share/gc/g1/g1Arguments.cpp index 28f850938c4..3dae853e045 100644 --- a/src/hotspot/share/gc/g1/g1Arguments.cpp +++ b/src/hotspot/share/gc/g1/g1Arguments.cpp @@ -177,6 +177,25 @@ void G1Arguments::initialize() { FLAG_SET_ERGO(ParallelGCThreads, 1); } + if (G1BarrierSimple) { +#if !defined(_LP64) || !(defined(X86) || defined(AARCH64)) + warning("G1BarrierSimple is not supported with current platform" + "; ignoring G1BarrierSimple flag."); + FLAG_SET_DEFAULT(G1BarrierSimple, false); +#else +#if INCLUDE_JVMCI + if (EnableJVMCI) { + warning("G1BarrierSimple is incompatible with JVMCI" + "; ignoring G1BarrierSimple flag."); + FLAG_SET_DEFAULT(G1BarrierSimple, false); + } else +#endif + { + FLAG_SET_DEFAULT(G1UseConcRefinement, false); + } +#endif + } + if (!G1UseConcRefinement) { if (!FLAG_IS_DEFAULT(G1ConcRefinementThreads)) { log_warning(gc, ergo)("Ignoring -XX:G1ConcRefinementThreads " diff --git a/src/hotspot/share/gc/g1/g1BarrierSet.cpp b/src/hotspot/share/gc/g1/g1BarrierSet.cpp index 3f2b27c2d06..c9367ead4d1 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSet.cpp +++ b/src/hotspot/share/gc/g1/g1BarrierSet.cpp @@ -107,6 +107,10 @@ void G1BarrierSet::invalidate(JavaThread* thread, MemRegion mr) { } volatile CardValue* byte = _card_table->byte_for(mr.start()); CardValue* last_byte = 
_card_table->byte_for(mr.last()); + if (G1BarrierSimple) { + memset((void*)byte, G1CardTable::dirty_card_val(), last_byte - byte + 1); + return; + } // skip young gen cards if (*byte == G1CardTable::g1_young_card_val()) { diff --git a/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp b/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp index e5b477ad156..1cfaeec9aa8 100644 --- a/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp +++ b/src/hotspot/share/gc/g1/g1BarrierSet.inline.hpp @@ -83,6 +83,10 @@ inline void G1BarrierSet::write_ref_array_work(MemRegion mr) { template <DecoratorSet decorators, typename T> inline void G1BarrierSet::write_ref_field_post(T* field) { volatile CardValue* byte = _card_table->byte_for(field); + if (G1BarrierSimple) { + *byte = G1CardTable::dirty_card_val(); + return; + } if (*byte != G1CardTable::g1_young_card_val()) { // Take a slow path for cards in old write_ref_field_post_slow(byte); diff --git a/src/hotspot/share/gc/g1/g1CardTable.cpp b/src/hotspot/share/gc/g1/g1CardTable.cpp index 0dc845825d6..b8aaebaf60c 100644 --- a/src/hotspot/share/gc/g1/g1CardTable.cpp +++ b/src/hotspot/share/gc/g1/g1CardTable.cpp @@ -29,6 +29,9 @@ #include "logging/log.hpp" void G1CardTable::g1_mark_as_young(const MemRegion& mr) { + if (G1BarrierSimple) { + return; + } CardValue *const first = byte_for(mr.start()); CardValue *const last = byte_after(mr.last()); @@ -37,7 +40,9 @@ void G1CardTable::g1_mark_as_young(const MemRegion& mr) { #ifndef PRODUCT void G1CardTable::verify_g1_young_region(MemRegion mr) { - verify_region(mr, g1_young_gen, true); + if (!G1BarrierSimple) { + verify_region(mr, g1_young_gen, true); + } } #endif @@ -69,6 +74,9 @@ void G1CardTable::initialize(G1RegionToSpaceMapper* mapper) { } bool G1CardTable::is_in_young(const void* p) const { + if (G1BarrierSimple) { + return G1CollectedHeap::heap()->heap_region_containing(p)->is_young(); + } volatile CardValue* card = byte_for(p); return *card == G1CardTable::g1_young_card_val(); } diff --git 
a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp b/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp index a9a06b2c82e..d8dd4a34716 100644 --- a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp +++ b/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.cpp @@ -118,6 +118,17 @@ BufferNodeList G1RedirtyCardsQueueSet::take_all_completed_buffers() { return result; } +void G1RedirtyCardsQueueSet::abandon_completed_buffers() { + BufferNodeList list = take_all_completed_buffers(); + BufferNode* buffers_to_delete = list._head; + while (buffers_to_delete != nullptr) { + BufferNode* bn = buffers_to_delete; + buffers_to_delete = bn->next(); + bn->set_next(nullptr); + deallocate_buffer(bn); + } +} + void G1RedirtyCardsQueueSet::update_tail(BufferNode* node) { // Node is the tail of a (possibly single element) list just prepended to // _list. If, after that prepend, node's follower is null, then node is diff --git a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp b/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp index b464d377298..e687f55fc8c 100644 --- a/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp +++ b/src/hotspot/share/gc/g1/g1RedirtyCardsQueue.hpp @@ -90,6 +90,8 @@ class G1RedirtyCardsQueueSet : public PtrQueueSet { // precondition: Must not be concurrent with buffer collection. 
BufferNode* all_completed_buffers() const; BufferNodeList take_all_completed_buffers(); + + void abandon_completed_buffers(); }; #endif // SHARE_GC_G1_G1REDIRTYCARDSQUEUE_HPP diff --git a/src/hotspot/share/gc/g1/g1RemSet.cpp b/src/hotspot/share/gc/g1/g1RemSet.cpp index b29125037b7..b4dd0aadb13 100644 --- a/src/hotspot/share/gc/g1/g1RemSet.cpp +++ b/src/hotspot/share/gc/g1/g1RemSet.cpp @@ -1253,6 +1253,27 @@ class G1MergeHeapRootsTask : public WorkerTask { size_t cards_skipped() const { return _cards_skipped; } }; + class G1MergeSimpleDirtyCardsRegionClosure : public HeapRegionClosure { + private: + G1RemSetScanState* _scan_state; + G1CardTable* _ct; + + public: + G1MergeSimpleDirtyCardsRegionClosure(G1CollectedHeap* g1h, G1RemSetScanState* scan_state) : + _scan_state(scan_state), + _ct(g1h->card_table()) + {} + + bool do_heap_region(HeapRegion* r) { + if (r->is_old_or_humongous() && !r->in_collection_set()) { + uint region_index = r->hrm_index(); + _scan_state->add_dirty_region(region_index); + _scan_state->set_chunk_range_dirty(region_index << HeapRegion::LogCardsPerRegion, HeapRegion::CardsPerRegion); + } + return false; + } + }; + HeapRegionClaimer _hr_claimer; G1RemSetScanState* _scan_state; BufferNode::Stack _dirty_card_buffers; @@ -1278,7 +1299,7 @@ class G1MergeHeapRootsTask : public WorkerTask { _initial_evacuation(initial_evacuation), _fast_reclaim_handled(false) { - if (initial_evacuation) { + if (initial_evacuation && !G1BarrierSimple) { G1DirtyCardQueueSet& dcqs = G1BarrierSet::dirty_card_queue_set(); BufferNodeList buffers = dcqs.take_all_completed_buffers(); if (buffers._entry_count != 0) { @@ -1338,11 +1359,19 @@ class G1MergeHeapRootsTask : public WorkerTask { assert(merge_remset_phase == G1GCPhaseTimes::MergeRS, "Wrong merge phase"); G1GCParPhaseTimesTracker x(p, G1GCPhaseTimes::MergeLB, worker_id); - G1MergeLogBufferCardsClosure cl(g1h, _scan_state); - apply_closure_to_dirty_card_buffers(&cl, worker_id); + if (G1BarrierSimple) { + 
G1MergeSimpleDirtyCardsRegionClosure cl(g1h, _scan_state); + G1CollectedHeap::heap()->heap_region_par_iterate_from_worker_offset(&cl, &_hr_claimer, worker_id); + p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, 0, G1GCPhaseTimes::MergeLBDirtyCards); + p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, 0, G1GCPhaseTimes::MergeLBSkippedCards); + } else { + G1MergeLogBufferCardsClosure cl(g1h, _scan_state); + apply_closure_to_dirty_card_buffers(&cl, worker_id); + + p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, cl.cards_dirty(), G1GCPhaseTimes::MergeLBDirtyCards); + p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, cl.cards_skipped(), G1GCPhaseTimes::MergeLBSkippedCards); + } - p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, cl.cards_dirty(), G1GCPhaseTimes::MergeLBDirtyCards); - p->record_thread_work_item(G1GCPhaseTimes::MergeLB, worker_id, cl.cards_skipped(), G1GCPhaseTimes::MergeLBSkippedCards); } } }; diff --git a/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp b/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp index 6edf87bb991..5feb229aad3 100644 --- a/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp +++ b/src/hotspot/share/gc/g1/g1YoungGCPostEvacuateTasks.cpp @@ -386,7 +386,11 @@ class G1PostEvacuateCollectionSetCleanupTask2::RedirtyLoggedCardsTask : public G virtual ~RedirtyLoggedCardsTask() { G1DirtyCardQueueSet& dcq = G1BarrierSet::dirty_card_queue_set(); - dcq.merge_bufferlists(_rdcqs); + if (G1BarrierSimple) { + _rdcqs->abandon_completed_buffers(); + } else { + dcq.merge_bufferlists(_rdcqs); + } _rdcqs->verify_empty(); } diff --git a/src/hotspot/share/gc/g1/g1_globals.hpp b/src/hotspot/share/gc/g1/g1_globals.hpp index 4a0433621f5..6fa08cc2c90 100644 --- a/src/hotspot/share/gc/g1/g1_globals.hpp +++ b/src/hotspot/share/gc/g1/g1_globals.hpp @@ -323,6 +323,9 @@ "related prediction sample. 
That sample must involve the same or "\ "more than that number of cards to be used.") \ \ + product(bool, G1BarrierSimple, false, \ + "Use simple G1 post barrier") \ + \ GC_G1_EVACUATION_FAILURE_FLAGS(develop, \ develop_pd, \ product, \ diff --git a/test/hotspot/jtreg/gc/arguments/TestG1ConcRefinementThreads.java b/test/hotspot/jtreg/gc/arguments/TestG1ConcRefinementThreads.java index 91be97782f5..76c68a763d2 100644 --- a/test/hotspot/jtreg/gc/arguments/TestG1ConcRefinementThreads.java +++ b/test/hotspot/jtreg/gc/arguments/TestG1ConcRefinementThreads.java @@ -27,6 +27,7 @@ * @test TestG1ConcRefinementThreads * @bug 8047976 * @requires vm.gc.G1 & vm.opt.G1ConcRefinementThreads == null + * @requires vm.opt.G1BarrierSimple == null * @summary Tests argument processing for G1ConcRefinementThreads * @library /test/lib * @library /