diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 1e734193850eee..f0dca5e51dbe15 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -1231,9 +1231,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     unsigned genMove2IfNeeded(unsigned size, regNumber tmpReg, GenTree* srcAddr, unsigned offset);
     unsigned genMove1IfNeeded(unsigned size, regNumber tmpReg, GenTree* srcAddr, unsigned offset);
     void genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* base, unsigned offset);
+    void genStoreRegToStackArg(var_types type, regNumber reg, int offset);
     void genStructPutArgRepMovs(GenTreePutArgStk* putArgStkNode);
     void genStructPutArgUnroll(GenTreePutArgStk* putArgStkNode);
-    void genStoreRegToStackArg(var_types type, regNumber reg, int offset);
+#ifdef TARGET_X86
+    void genStructPutArgPush(GenTreePutArgStk* putArgStkNode);
+#else
+    void genStructPutArgPartialRepMovs(GenTreePutArgStk* putArgStkNode);
+#endif
 #endif // FEATURE_PUT_STRUCT_ARG_STK
 
     void genCodeForStoreBlk(GenTreeBlk* storeBlkNode);
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 25cd51d4a48a98..ab7f5089b31c83 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -3337,108 +3337,64 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
     GenTree* src = putArgNode->AsOp()->gtOp1;
     // We will never call this method for SIMD types, which are stored directly
     // in genPutStructArgStk().
-    noway_assert(src->TypeGet() == TYP_STRUCT);
+    assert(src->isContained() && src->OperIs(GT_OBJ) && src->TypeIs(TYP_STRUCT));
+    assert(!src->AsObj()->GetLayout()->HasGCPtr());
+#ifdef TARGET_X86
+    assert(!m_pushStkArg);
+#endif
 
     unsigned size = putArgNode->GetStackByteSize();
-    assert(size <= CPBLK_UNROLL_LIMIT);
-
-    emitter* emit         = GetEmitter();
-    unsigned putArgOffset = putArgNode->getArgOffset();
-
-    assert(src->isContained());
-
-    assert(src->gtOper == GT_OBJ);
+    assert((XMM_REGSIZE_BYTES <= size) && (size <= CPBLK_UNROLL_LIMIT));
 
     if (src->AsOp()->gtOp1->isUsedFromReg())
     {
         genConsumeReg(src->AsOp()->gtOp1);
     }
 
-    unsigned offset = 0;
-
+    unsigned  offset     = 0;
     regNumber xmmTmpReg  = REG_NA;
     regNumber intTmpReg  = REG_NA;
     regNumber longTmpReg = REG_NA;
-#ifdef TARGET_X86
-    // On x86 we use an XMM register for both 16 and 8-byte chunks, but if it's
-    // less than 16 bytes, we will just be using pushes
-    if (size >= 8)
-    {
-        xmmTmpReg  = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
-        longTmpReg = xmmTmpReg;
-    }
-    if ((size & 0x7) != 0)
-    {
-        intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
-    }
-#else // !TARGET_X86
-    // On x64 we use an XMM register only for 16-byte chunks.
+
     if (size >= XMM_REGSIZE_BYTES)
     {
         xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
     }
-    if ((size & 0xf) != 0)
+    if ((size % XMM_REGSIZE_BYTES) != 0)
     {
-        intTmpReg  = putArgNode->GetSingleTempReg(RBM_ALLINT);
-        longTmpReg = intTmpReg;
+        intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
     }
-#endif // !TARGET_X86
 
-    // If the size of this struct is larger than 16 bytes
-    // let's use SSE2 to be able to do 16 byte at a time
-    // loads and stores.
-    if (size >= XMM_REGSIZE_BYTES)
-    {
 #ifdef TARGET_X86
-        assert(!m_pushStkArg);
-#endif // TARGET_X86
-        size_t slots = size / XMM_REGSIZE_BYTES;
-
-        assert(putArgNode->gtGetOp1()->isContained());
-        assert(putArgNode->gtGetOp1()->AsOp()->gtOper == GT_OBJ);
+    longTmpReg = xmmTmpReg;
+#else
+    longTmpReg = intTmpReg;
+#endif
 
+    // Let's use SSE2 so we can copy 16 bytes at a time with its loads and stores.
+    size_t slots = size / XMM_REGSIZE_BYTES;
+    while (slots-- > 0)
+    {
         // TODO: In the below code the load and store instructions are for 16 bytes, but the
-        //       type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
-        //       this probably needs to be changed.
-        while (slots-- > 0)
-        {
-            // Load
-            genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src->gtGetOp1(), offset);
+        //       type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
+        //       this probably needs to be changed.
 
-            // Store
-            genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset);
+        // Load
+        genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src->gtGetOp1(), offset);
+        // Store
+        genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset);
 
-            offset += XMM_REGSIZE_BYTES;
-        }
+        offset += XMM_REGSIZE_BYTES;
     }
 
     // Fill the remainder (15 bytes or less) if there's one.
-    if ((size & 0xf) != 0)
+    if ((size % XMM_REGSIZE_BYTES) != 0)
     {
-#ifdef TARGET_X86
-        if (m_pushStkArg)
-        {
-            // This case is currently supported only for the case where the total size is
-            // less than XMM_REGSIZE_BYTES. We need to push the remaining chunks in reverse
-            // order. However, morph has ensured that we have a struct that is an even
-            // multiple of TARGET_POINTER_SIZE, so we don't need to worry about alignment.
-            assert(((size & 0xc) == size) && (offset == 0));
-            // If we have a 4 byte chunk, load it from either offset 0 or 8, depending on
-            // whether we've got an 8 byte chunk, and then push it on the stack.
-            unsigned pushedBytes = genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, size & 0x8);
-            // Now if we have an 8 byte chunk, load it from offset 0 (it's the first chunk)
-            // and push it on the stack.
-            pushedBytes += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, 0);
-        }
-        else
-#endif // TARGET_X86
-        {
-            offset += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, offset);
-            offset += genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-            offset += genMove2IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-            offset += genMove1IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-            assert(offset == size);
-        }
+        offset += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove2IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove1IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
+        assert(offset == size);
     }
 }
 
@@ -3453,18 +3409,180 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
 //
 void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode)
 {
-    GenTree* srcAddr = putArgNode->gtGetOp1();
-    assert(srcAddr->TypeGet() == TYP_STRUCT);
+    GenTree* src = putArgNode->gtGetOp1();
+    assert(src->TypeGet() == TYP_STRUCT);
+    assert(!src->AsObj()->GetLayout()->HasGCPtr());
 
     // Make sure we got the arguments of the cpblk operation in the right registers, and that
-    // 'srcAddr' is contained as expected.
+    // 'src' is contained as expected.
     assert(putArgNode->gtRsvdRegs == (RBM_RDI | RBM_RCX | RBM_RSI));
-    assert(srcAddr->isContained());
+    assert(src->isContained());
 
     genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX);
     instGen(INS_r_movsb);
 }
 
+#ifdef TARGET_X86
+//------------------------------------------------------------------------
+// genStructPutArgPush: Generates code for passing a struct arg by value on stack using "push".
+//
+// Arguments:
+//    putArgNode - the PutArgStk tree.
+//
+// Notes:
+//    Used only on x86, in two cases:
+//    - Structs 4, 8, or 12 bytes in size (less than XMM_REGSIZE_BYTES, multiple of TARGET_POINTER_SIZE).
+//    - Structs that contain GC pointers - they are guaranteed to be sized correctly by the VM.
+//
+void CodeGen::genStructPutArgPush(GenTreePutArgStk* putArgNode)
+{
+    // On x86, any struct that contains GC references must be stored to the stack using `push` instructions so
+    // that the emitter properly detects the need to update the method's GC information.
+    //
+    // Strictly speaking, it is only necessary to use "push" to store the GC references themselves, so for structs
+    // with large numbers of consecutive non-GC-ref-typed fields, we may be able to improve the code size in the
+    // future.
+    assert(m_pushStkArg);
+
+    GenTree* src     = putArgNode->Data();
+    GenTree* srcAddr = putArgNode->Data()->AsObj()->Addr();
+
+    regNumber  srcAddrReg   = srcAddr->GetRegNum();
+    const bool srcAddrInReg = srcAddrReg != REG_NA;
+
+    unsigned srcLclNum    = 0;
+    unsigned srcLclOffset = 0;
+    if (srcAddrInReg)
+    {
+        srcAddrReg = genConsumeReg(srcAddr);
+    }
+    else
+    {
+        assert(srcAddr->OperIsLocalAddr());
+
+        srcLclNum    = srcAddr->AsLclVarCommon()->GetLclNum();
+        srcLclOffset = srcAddr->AsLclVarCommon()->GetLclOffs();
+    }
+
+    ClassLayout*   layout   = src->AsObj()->GetLayout();
+    const unsigned byteSize = putArgNode->GetStackByteSize();
+    assert((byteSize % TARGET_POINTER_SIZE == 0) && ((byteSize < XMM_REGSIZE_BYTES) || layout->HasGCPtr()));
+    const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
+    assert(putArgNode->gtNumSlots == numSlots);
+
+    for (int i = numSlots - 1; i >= 0; --i)
+    {
+        emitAttr       slotAttr   = emitTypeSize(layout->GetGCPtrType(i));
+        const unsigned byteOffset = i * TARGET_POINTER_SIZE;
+        if (srcAddrInReg)
+        {
+            GetEmitter()->emitIns_AR_R(INS_push, slotAttr, REG_NA, srcAddrReg, byteOffset);
+        }
+        else
+        {
+            GetEmitter()->emitIns_S(INS_push, slotAttr, srcLclNum, srcLclOffset + byteOffset);
+        }
+
+        AddStackLevel(TARGET_POINTER_SIZE);
+    }
+}
+#endif // TARGET_X86
+
+#ifndef TARGET_X86
+//------------------------------------------------------------------------
+// genStructPutArgPartialRepMovs: Generates code for passing a struct arg by value on stack using
+//                                a mix of pointer-sized stores, "movsq" and "rep movsq".
+//
+// Arguments:
+//    putArgNode - the PutArgStk tree.
+//
+// Notes:
+//    Used on non-x86 targets (Unix x64) for structs with GC pointers.
+//
+void CodeGen::genStructPutArgPartialRepMovs(GenTreePutArgStk* putArgNode)
+{
+    // Consume these registers.
+    // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
+    genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_NA);
+
+    GenTreeObj*    src         = putArgNode->gtGetOp1()->AsObj();
+    ClassLayout*   layout      = src->GetLayout();
+    const bool     srcIsLocal  = src->Addr()->OperIsLocalAddr();
+    const emitAttr srcAddrAttr = srcIsLocal ? EA_PTRSIZE : EA_BYREF;
+
+#ifdef DEBUG
+    unsigned numGCSlotsCopied = 0;
+#endif // DEBUG
+
+    assert(layout->HasGCPtr());
+    const unsigned byteSize = putArgNode->GetStackByteSize();
+    assert(byteSize % TARGET_POINTER_SIZE == 0);
+    const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
+    assert(putArgNode->gtNumSlots == numSlots);
+
+    // No need to disable GC the way COPYOBJ does. Here the refs are always copied with atomic operations.
+    for (unsigned i = 0; i < numSlots;)
+    {
+        if (!layout->IsGCPtr(i))
+        {
+            // Let's see if we can use rep movsp (alias for movsd or movsq for 32 and 64 bits respectively)
+            // instead of a sequence of movsp instructions to save cycles and code size.
+            unsigned adjacentNonGCSlotCount = 0;
+            do
+            {
+                adjacentNonGCSlotCount++;
+                i++;
+            } while ((i < numSlots) && !layout->IsGCPtr(i));
+
+            // If we have a very small contiguous non-ref region, it's better just to
+            // emit a sequence of movsp instructions
+            if (adjacentNonGCSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
+            {
+                for (; adjacentNonGCSlotCount > 0; adjacentNonGCSlotCount--)
+                {
+                    instGen(INS_movsp);
+                }
+            }
+            else
+            {
+                GetEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, adjacentNonGCSlotCount);
+                instGen(INS_r_movsp);
+            }
+        }
+        else
+        {
+            // We have a GC (byref or ref) pointer
+            // TODO-Amd64-Unix: Here a better solution (for code size and CQ) would be to use movsp instruction,
+            // but the logic for emitting a GC info record is not available (it is internal for the emitter
+            // only.) See emitGCVarLiveUpd function. If we could call it separately, we could do
+            // instGen(INS_movsp); and emission of gc info.
+
+            var_types memType = layout->GetGCPtrType(i);
+            GetEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), REG_RCX, REG_RSI, 0);
+            genStoreRegToStackArg(memType, REG_RCX, i * TARGET_POINTER_SIZE);
+#ifdef DEBUG
+            numGCSlotsCopied++;
+#endif // DEBUG
+
+            i++;
+            if (i < numSlots)
+            {
+                // Source for the copy operation.
+                // If a LocalAddr, use EA_PTRSIZE - copy from stack.
+                // If not a LocalAddr, use EA_BYREF - the source location is not on the stack.
+                GetEmitter()->emitIns_R_I(INS_add, srcAddrAttr, REG_RSI, TARGET_POINTER_SIZE);
+
+                // Always copying to the stack - outgoing arg area
+                // (or the outgoing arg area of the caller for a tail call) - use EA_PTRSIZE.
+                GetEmitter()->emitIns_R_I(INS_add, EA_PTRSIZE, REG_RDI, TARGET_POINTER_SIZE);
+            }
+        }
+    }
+
+    assert(numGCSlotsCopied == layout->GetGCPtrCount());
+}
+#endif // !TARGET_X86
+
 //------------------------------------------------------------------------
 // If any Vector3 args are on stack and they are not pass-by-ref, the upper 32bits
 // must be cleared to zeroes. The native compiler doesn't clear the upper bits
@@ -7413,40 +7531,31 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
     }
 #endif // FEATURE_SIMD
 
-    // If the gtPutArgStkKind is one of the push types, we do not pre-adjust the stack.
-    // This is set in Lowering, and is true if and only if:
-    // - This argument contains any GC pointers OR
-    // - It is a GT_FIELD_LIST OR
-    // - It is less than 16 bytes in size.
-    CLANG_FORMAT_COMMENT_ANCHOR;
-
 #ifdef DEBUG
     switch (putArgStk->gtPutArgStkKind)
     {
         case GenTreePutArgStk::Kind::RepInstr:
         case GenTreePutArgStk::Kind::Unroll:
-            assert(!source->AsObj()->GetLayout()->HasGCPtr() && (argSize >= 16));
+            assert(!source->AsObj()->GetLayout()->HasGCPtr());
             break;
+        case GenTreePutArgStk::Kind::Push:
         case GenTreePutArgStk::Kind::PushAllSlots:
-            assert(source->OperIs(GT_FIELD_LIST) || source->AsObj()->GetLayout()->HasGCPtr() || (argSize < 16));
+            assert(source->OperIs(GT_FIELD_LIST) || source->AsObj()->GetLayout()->HasGCPtr() ||
+                   (argSize < XMM_REGSIZE_BYTES));
            break;
-        case GenTreePutArgStk::Kind::Invalid:
+
         default:
-            assert(!"Uninitialized GenTreePutArgStk::Kind");
-            break;
+            unreached();
     }
 #endif // DEBUG
 
-    if (putArgStk->isPushKind())
-    {
-        m_pushStkArg = true;
-        return false;
-    }
-    else
+    // In lowering (see "LowerPutArgStk") we have determined what sort of instructions
+    // are going to be used for this node. If we will not be using "push"es, the stack
+    // needs to be adjusted first (such that the SP points to the base of the outgoing arg).
+    //
+    if (!putArgStk->isPushKind())
     {
-        m_pushStkArg = false;
-
         // If argSize is large, we need to probe the stack like we do in the prolog (genAllocLclFrame)
         // or for localloc (genLclHeap), to ensure we touch the stack pages sequentially, and don't miss
         // the stack guard pages. The prolog probes, but we don't know at this point how much higher
@@ -7467,8 +7576,13 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
         }
 
         AddStackLevel(argSize);
+        m_pushStkArg = false;
         return true;
     }
+
+    // Otherwise, "push" will be adjusting the stack for us.
+    m_pushStkArg = true;
+    return false;
 }
 
 //---------------------------------------------------------------------
@@ -7685,10 +7799,6 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
 //
 // Arguments
 //    treeNode      - the GT_PUTARG_STK node
-//    targetType    - the type of the treeNode
-//
-// Return value:
-//    None
 //
 void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk)
 {
@@ -7947,6 +8057,7 @@ void CodeGen::genStoreRegToStackArg(var_types type, regNumber srcReg, int offset
 //  corresponding to the argument area (where we will put the argument on the stack).
 //  For tail calls this is the baseVarNum = 0.
 //  For non tail calls this is the outgoingArgSpace.
+//
 void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
 {
     GenTree* source = putArgStk->gtGetOp1();
@@ -7972,151 +8083,30 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
 
     ClassLayout* layout = source->AsObj()->GetLayout();
 
-    if (!layout->HasGCPtr())
-    {
-        switch (putArgStk->gtPutArgStkKind)
-        {
-            case GenTreePutArgStk::Kind::RepInstr:
-                genStructPutArgRepMovs(putArgStk);
-                break;
-            case GenTreePutArgStk::Kind::Unroll:
-                genStructPutArgUnroll(putArgStk);
-                break;
-            case GenTreePutArgStk::Kind::Push:
-                genStructPutArgUnroll(putArgStk);
-                break;
-            default:
-                unreached();
-        }
-    }
-    else
+    switch (putArgStk->gtPutArgStkKind)
     {
-        // No need to disable GC the way COPYOBJ does. Here the refs are copied in atomic operations always.
-        CLANG_FORMAT_COMMENT_ANCHOR;
-
-#ifdef TARGET_X86
-        // On x86, any struct that has contains GC references must be stored to the stack using `push` instructions so
-        // that the emitter properly detects the need to update the method's GC information.
-        //
-        // Strictly speaking, it is only necessary to use `push` to store the GC references themselves, so for structs
-        // with large numbers of consecutive non-GC-ref-typed fields, we may be able to improve the code size in the
-        // future.
-        assert(m_pushStkArg);
-
-        GenTree*       srcAddr  = source->gtGetOp1();
-        const unsigned byteSize = putArgStk->GetStackByteSize();
-        assert(byteSize % TARGET_POINTER_SIZE == 0);
-        const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
-        assert(putArgStk->gtNumSlots == numSlots);
-
-        regNumber  srcRegNum    = srcAddr->GetRegNum();
-        const bool srcAddrInReg = srcRegNum != REG_NA;
-
-        unsigned srcLclNum    = 0;
-        unsigned srcLclOffset = 0;
-        if (srcAddrInReg)
-        {
-            genConsumeReg(srcAddr);
-        }
-        else
-        {
-            assert(srcAddr->OperIsLocalAddr());
-
-            srcLclNum    = srcAddr->AsLclVarCommon()->GetLclNum();
-            srcLclOffset = srcAddr->AsLclVarCommon()->GetLclOffs();
-        }
-
-        for (int i = numSlots - 1; i >= 0; --i)
-        {
-            emitAttr       slotAttr   = emitTypeSize(layout->GetGCPtrType(i));
-            const unsigned byteOffset = i * TARGET_POINTER_SIZE;
-            if (srcAddrInReg)
-            {
-                GetEmitter()->emitIns_AR_R(INS_push, slotAttr, REG_NA, srcRegNum, byteOffset);
-            }
-            else
-            {
-                GetEmitter()->emitIns_S(INS_push, slotAttr, srcLclNum, srcLclOffset + byteOffset);
-            }
-            AddStackLevel(TARGET_POINTER_SIZE);
-        }
-#else // !defined(TARGET_X86)
-
-        // Consume these registers.
-        // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
-        genConsumePutStructArgStk(putArgStk, REG_RDI, REG_RSI, REG_NA);
-
-        const bool     srcIsLocal  = putArgStk->gtOp1->AsObj()->gtOp1->OperIsLocalAddr();
-        const emitAttr srcAddrAttr = srcIsLocal ? EA_PTRSIZE : EA_BYREF;
-
-#if DEBUG
-        unsigned numGCSlotsCopied = 0;
-#endif // DEBUG
-
-        const unsigned byteSize = putArgStk->GetStackByteSize();
-        assert(byteSize % TARGET_POINTER_SIZE == 0);
-        const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
-        assert(putArgStk->gtNumSlots == numSlots);
-        for (unsigned i = 0; i < numSlots;)
-        {
-            if (!layout->IsGCPtr(i))
-            {
-                // Let's see if we can use rep movsp (alias for movsd or movsq for 32 and 64 bits respectively)
-                // instead of a sequence of movsp instructions to save cycles and code size.
-                unsigned adjacentNonGCSlotCount = 0;
-                do
-                {
-                    adjacentNonGCSlotCount++;
-                    i++;
-                } while ((i < numSlots) && !layout->IsGCPtr(i));
+        case GenTreePutArgStk::Kind::RepInstr:
+            genStructPutArgRepMovs(putArgStk);
+            break;
 
-                // If we have a very small contiguous non-ref region, it's better just to
-                // emit a sequence of movsp instructions
-                if (adjacentNonGCSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
-                {
-                    for (; adjacentNonGCSlotCount > 0; adjacentNonGCSlotCount--)
-                    {
-                        instGen(INS_movsp);
-                    }
-                }
-                else
-                {
-                    GetEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, adjacentNonGCSlotCount);
-                    instGen(INS_r_movsp);
-                }
-            }
-            else
-            {
-                // We have a GC (byref or ref) pointer
-                // TODO-Amd64-Unix: Here a better solution (for code size and CQ) would be to use movsp instruction,
-                // but the logic for emitting a GC info record is not available (it is internal for the emitter
-                // only.) See emitGCVarLiveUpd function. If we could call it separately, we could do
-                // instGen(INS_movsp); and emission of gc info.
-
-                var_types memType = layout->GetGCPtrType(i);
-                GetEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), REG_RCX, REG_RSI, 0);
-                genStoreRegToStackArg(memType, REG_RCX, i * TARGET_POINTER_SIZE);
-#ifdef DEBUG
-                numGCSlotsCopied++;
-#endif // DEBUG
+#ifndef TARGET_X86
+        case GenTreePutArgStk::Kind::PartialRepInstr:
+            genStructPutArgPartialRepMovs(putArgStk);
+            break;
+#endif // !TARGET_X86
 
-                i++;
-                if (i < numSlots)
-                {
-                    // Source for the copy operation.
-                    // If a LocalAddr, use EA_PTRSIZE - copy from stack.
-                    // If not a LocalAddr, use EA_BYREF - the source location is not on the stack.
-                    GetEmitter()->emitIns_R_I(INS_add, srcAddrAttr, REG_RSI, TARGET_POINTER_SIZE);
-
-                    // Always copying to the stack - outgoing arg area
-                    // (or the outgoing arg area of the caller for a tail call) - use EA_PTRSIZE.
-                    GetEmitter()->emitIns_R_I(INS_add, EA_PTRSIZE, REG_RDI, TARGET_POINTER_SIZE);
-                }
-            }
-        }
+        case GenTreePutArgStk::Kind::Unroll:
+            genStructPutArgUnroll(putArgStk);
+            break;
 
-        assert(numGCSlotsCopied == layout->GetGCPtrCount());
+#ifdef TARGET_X86
+        case GenTreePutArgStk::Kind::Push:
+            genStructPutArgPush(putArgStk);
+            break;
 #endif // TARGET_X86
+
+        default:
+            unreached();
     }
 }
 #endif // defined(FEATURE_PUT_STRUCT_ARG_STK)
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 8e1dd02c2c2f24..c6277882f52758 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -10758,6 +10758,9 @@ void Compiler::gtDispTree(GenTree* tree,
                 case GenTreePutArgStk::Kind::RepInstr:
                     printf(" (RepInstr)");
                     break;
+                case GenTreePutArgStk::Kind::PartialRepInstr:
+                    printf(" (PartialRepInstr)");
+                    break;
                 case GenTreePutArgStk::Kind::Unroll:
                     printf(" (Unroll)");
                     break;
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 5d37fb36562653..8fddf9bb623c2f 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -6775,7 +6775,7 @@ struct GenTreePutArgStk : public GenTreeUnOp
     // block node.
 
     enum class Kind : __int8{
-        Invalid, RepInstr, Unroll, Push, PushAllSlots,
+        Invalid, RepInstr, PartialRepInstr, Unroll, Push, PushAllSlots,
     };
     Kind gtPutArgStkKind;
 #endif
@@ -6825,6 +6825,11 @@ struct GenTreePutArgStk : public GenTreeUnOp
 #endif
     }
 
+    GenTree*& Data()
+    {
+        return gtOp1;
+    }
+
 #if FEATURE_FASTTAILCALL
     bool putInIncomingArgArea() const
     {
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 46b48c4a8a9393..2bf009026bd895 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -600,31 +600,41 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)
 
         // TODO-X86-CQ: The helper call either is not supported on x86 or required more work
         // (I don't know which).
-        if (size <= CPBLK_UNROLL_LIMIT && !layout->HasGCPtr())
+        if (!layout->HasGCPtr())
        {
 #ifdef TARGET_X86
             if (size < XMM_REGSIZE_BYTES)
             {
+                // Codegen for "Kind::Push" will always load bytes in TARGET_POINTER_SIZE
+                // chunks. As such, the correctness of this code depends on the fact that
+                // morph will copy any "mis-sized" (too small) non-local OBJs into a temp,
+                // thus preventing any possible out-of-bounds memory reads.
+                assert(((layout->GetSize() % TARGET_POINTER_SIZE) == 0) || src->OperIsLocalRead() ||
+                       (src->OperIsIndir() && src->AsIndir()->Addr()->IsLocalAddrExpr()));
                 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
             }
             else
 #endif // TARGET_X86
+                if (size <= CPBLK_UNROLL_LIMIT)
             {
                 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
             }
+            else
+            {
+                putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
+            }
         }
-#ifdef TARGET_X86
-        else if (layout->HasGCPtr())
+        else // There are GC pointers.
         {
+#ifdef TARGET_X86
             // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly
             // update the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
             putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
+#else  // !TARGET_X86
+            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PartialRepInstr;
+#endif // !TARGET_X86
         }
-#endif // TARGET_X86
-        else
-        {
-            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
-        }
+
         // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
         MakeSrcContained(putArgStk, src);
         if (haveLocalAddr)
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 715c325eacf8de..f9e69773d70cbb 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1576,41 +1576,40 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
         return BuildSimple(putArgStk);
     }
 
-    ClassLayout* layout = src->AsObj()->GetLayout();
-
     ssize_t size = putArgStk->GetStackByteSize();
     switch (putArgStk->gtPutArgStkKind)
     {
-        case GenTreePutArgStk::Kind::Push:
-        case GenTreePutArgStk::Kind::PushAllSlots:
         case GenTreePutArgStk::Kind::Unroll:
             // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
-            if (!layout->HasGCPtr() && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
+            if ((size % XMM_REGSIZE_BYTES) != 0)
             {
                 regMaskTP regMask = allRegs(TYP_INT);
                 buildInternalIntRegisterDefForNode(putArgStk, regMask);
             }
 
-#ifdef TARGET_X86
-            if (size >= 8)
-#else // !TARGET_X86
             if (size >= XMM_REGSIZE_BYTES)
-#endif // !TARGET_X86
             {
-                // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
-                // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a
-                // series of 16-byte loads and stores.
+                // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES, reserve
+                // an XMM register to use it for a series of 16-byte loads and stores.
                 buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
                 SetContainsAVXFlags();
             }
             break;
 
         case GenTreePutArgStk::Kind::RepInstr:
+#ifndef TARGET_X86
+        case GenTreePutArgStk::Kind::PartialRepInstr:
+#endif
            buildInternalIntRegisterDefForNode(putArgStk, RBM_RDI);
            buildInternalIntRegisterDefForNode(putArgStk, RBM_RCX);
            buildInternalIntRegisterDefForNode(putArgStk, RBM_RSI);
            break;
 
+#ifdef TARGET_X86
+        case GenTreePutArgStk::Kind::Push:
+            break;
+#endif // TARGET_X86
+
         default:
            unreached();
     }
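
For reference, below is a minimal standalone sketch of the copy-strategy selection that LowerPutArgStk performs after this change. The enum, function name, and constant values here are illustrative stand-ins for the real JIT types and tuning knobs (GenTreePutArgStk::Kind, XMM_REGSIZE_BYTES, CPBLK_UNROLL_LIMIT), not code from the patch itself.

// Illustrative only: a simplified model of the strategy selection in
// Lowering::LowerPutArgStk. Names and constants are hypothetical stand-ins.
#include <cstdio>

enum class PutArgKind { Invalid, RepInstr, PartialRepInstr, Unroll, Push, PushAllSlots };

constexpr unsigned XMM_REGSIZE_BYTES  = 16; // SSE register width
constexpr unsigned CPBLK_UNROLL_LIMIT = 64; // assumed unroll threshold for xarch

PutArgKind SelectPutArgKind(unsigned size, bool hasGCPtrs, bool isX86)
{
    if (!hasGCPtrs)
    {
        // x86 structs smaller than an XMM register are pushed slot by slot.
        if (isX86 && (size < XMM_REGSIZE_BYTES))
            return PutArgKind::Push;
        // Small-to-medium structs: unrolled SSE2 copies plus a scalar remainder.
        if (size <= CPBLK_UNROLL_LIMIT)
            return PutArgKind::Unroll;
        // Anything larger: "rep movsb".
        return PutArgKind::RepInstr;
    }
    // GC pointers present: x86 must use "push" so the emitter tracks GC info;
    // other targets mix "movsp"/"rep movsp" runs with pointer-sized GC stores.
    return isX86 ? PutArgKind::Push : PutArgKind::PartialRepInstr;
}

int main()
{
    // A 24-byte struct with no GC refs on x64 selects the unrolled copy (3 == Unroll).
    printf("%d\n", static_cast<int>(SelectPutArgKind(24, false, false)));
    return 0;
}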