diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 574f1756cc733..fd049d1a57860 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -59,6 +59,7 @@ add_llvm_target(RISCVCodeGen
   RISCVTargetObjectFile.cpp
   RISCVTargetTransformInfo.cpp
   RISCVVectorPeephole.cpp
+  RISCVVLOptimizer.cpp
   RISCVZacasABIFix.cpp
   GISel/RISCVCallLowering.cpp
   GISel/RISCVInstructionSelector.cpp
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 482c0cce78b10..d7bab601d545c 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -102,6 +102,9 @@ void initializeRISCVO0PreLegalizerCombinerPass(PassRegistry &);
 
 FunctionPass *createRISCVPreLegalizerCombiner();
 void initializeRISCVPreLegalizerCombinerPass(PassRegistry &);
+
+FunctionPass *createRISCVVLOptimizerPass();
+void initializeRISCVVLOptimizerPass(PassRegistry &);
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 6a72857b93b6c..d8e068e842779 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -103,6 +103,11 @@ static cl::opt<bool> EnableVSETVLIAfterRVVRegAlloc(
     cl::desc("Insert vsetvls after vector register allocation"),
     cl::init(true));
 
+static cl::opt<bool>
+    EnableVLOptimizer("riscv-enable-vl-optimizer",
+                      cl::desc("Enable the RISC-V VL Optimizer pass"),
+                      cl::init(false), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
   RegisterTargetMachine<RISCVTargetMachine> X(getTheRISCV32Target());
   RegisterTargetMachine<RISCVTargetMachine> Y(getTheRISCV64Target());
@@ -551,8 +556,11 @@ void RISCVPassConfig::addMachineSSAOptimization() {
 
 void RISCVPassConfig::addPreRegAlloc() {
   addPass(createRISCVPreRAExpandPseudoPass());
-  if (TM->getOptLevel() != CodeGenOptLevel::None)
+  if (TM->getOptLevel() != CodeGenOptLevel::None) {
     addPass(createRISCVMergeBaseOffsetOptPass());
+    if (EnableVLOptimizer)
+      addPass(createRISCVVLOptimizerPass());
+  }
 
   addPass(createRISCVInsertReadWriteCSRPass());
   addPass(createRISCVInsertWriteVXRMPass());
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
new file mode 100644
index 0000000000000..90af9ef898d95
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -0,0 +1,829 @@
+//===-------------- RISCVVLOptimizer.cpp - VL Optimizer -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass reduces the VL where possible at the MI level, before VSETVLI
+// instructions are inserted.
+//
+// The purpose of this optimization is to make the VL argument, for instructions
+// that have a VL argument, as small as possible. This is implemented by
+// visiting each instruction in reverse order and checking whether its VL can
+// be reduced.
+//
+//===---------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVMachineFunctionInfo.h"
+#include "RISCVSubtarget.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-vl-optimizer"
+#define PASS_NAME "RISC-V VL Optimizer"
+
+namespace {
+
+class RISCVVLOptimizer : public MachineFunctionPass {
+  const MachineRegisterInfo *MRI;
+  const MachineDominatorTree *MDT;
+
+public:
+  static char ID;
+
+  RISCVVLOptimizer() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return PASS_NAME; }
+
+private:
+  bool checkUsers(std::optional<Register> &CommonVL, MachineInstr &MI);
+  bool tryReduceVL(MachineInstr &MI);
+  bool isCandidate(const MachineInstr &MI) const;
+};
+
+} // end anonymous namespace
+
+char RISCVVLOptimizer::ID = 0;
+INITIALIZE_PASS_BEGIN(RISCVVLOptimizer, DEBUG_TYPE, PASS_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(RISCVVLOptimizer, DEBUG_TYPE, PASS_NAME, false, false)
+
+FunctionPass *llvm::createRISCVVLOptimizerPass() {
+  return new RISCVVLOptimizer();
+}
+
+/// Return true if R is a physical or virtual vector register, false otherwise.
+static bool isVectorRegClass(Register R, const MachineRegisterInfo *MRI) {
+  if (R.isPhysical())
+    return RISCV::VRRegClass.contains(R);
+  const TargetRegisterClass *RC = MRI->getRegClass(R);
+  return RISCVRI::isVRegClass(RC->TSFlags);
+}
+
+/// Represents the EMUL and EEW of a MachineOperand.
+struct OperandInfo {
+  enum class State {
+    Unknown,
+    Known,
+  } S;
+
+  // Represented as 1, 2, 4, 8, ... with a fractional indicator. This is
+  // because EMUL can take on values that don't map to RISCVII::VLMUL values
+  // exactly. For example, a mask operand can have an EMUL less than MF8.
+  std::optional<std::pair<unsigned, bool>> EMUL;
+
+  unsigned Log2EEW;
+
+  OperandInfo(RISCVII::VLMUL EMUL, unsigned Log2EEW)
+      : S(State::Known), EMUL(RISCVVType::decodeVLMUL(EMUL)), Log2EEW(Log2EEW) {
+  }
+
+  OperandInfo(std::pair<unsigned, bool> EMUL, unsigned Log2EEW)
+      : S(State::Known), EMUL(EMUL), Log2EEW(Log2EEW) {}
+
+  OperandInfo() : S(State::Unknown) {}
+
+  bool isUnknown() const { return S == State::Unknown; }
+  bool isKnown() const { return S == State::Known; }
+
+  static bool EMULAndEEWAreEqual(const OperandInfo &A, const OperandInfo &B) {
+    assert(A.isKnown() && B.isKnown() && "Both operands must be known");
+
+    return A.Log2EEW == B.Log2EEW && A.EMUL->first == B.EMUL->first &&
+           A.EMUL->second == B.EMUL->second;
+  }
+
+  void print(raw_ostream &OS) const {
+    if (isUnknown()) {
+      OS << "Unknown";
+      return;
+    }
+    assert(EMUL && "Expected EMUL to have value");
+    OS << "EMUL: m";
+    if (EMUL->second)
+      OS << "f";
+    OS << EMUL->first;
+    OS << ", EEW: " << (1 << Log2EEW);
+  }
+};
+
+static raw_ostream &operator<<(raw_ostream &OS, const OperandInfo &OI) {
+  OI.print(OS);
+  return OS;
+}
+
+namespace llvm {
+namespace RISCVVType {
+/// Return the RISCVII::VLMUL that is two times VLMul.
+/// Precondition: VLMul is not LMUL_RESERVED or LMUL_8.
+static RISCVII::VLMUL twoTimesVLMUL(RISCVII::VLMUL VLMul) { + switch (VLMul) { + case RISCVII::VLMUL::LMUL_F8: + return RISCVII::VLMUL::LMUL_F4; + case RISCVII::VLMUL::LMUL_F4: + return RISCVII::VLMUL::LMUL_F2; + case RISCVII::VLMUL::LMUL_F2: + return RISCVII::VLMUL::LMUL_1; + case RISCVII::VLMUL::LMUL_1: + return RISCVII::VLMUL::LMUL_2; + case RISCVII::VLMUL::LMUL_2: + return RISCVII::VLMUL::LMUL_4; + case RISCVII::VLMUL::LMUL_4: + return RISCVII::VLMUL::LMUL_8; + case RISCVII::VLMUL::LMUL_8: + default: + llvm_unreachable("Could not multiply VLMul by 2"); + } +} + +/// Return EMUL = (EEW / SEW) * LMUL where EEW comes from Log2EEW and LMUL and +/// SEW are from the TSFlags of MI. +static std::pair +getEMULEqualsEEWDivSEWTimesLMUL(unsigned Log2EEW, const MachineInstr &MI) { + RISCVII::VLMUL MIVLMUL = RISCVII::getLMul(MI.getDesc().TSFlags); + auto [MILMUL, MILMULIsFractional] = RISCVVType::decodeVLMUL(MIVLMUL); + unsigned MILog2SEW = + MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); + unsigned MISEW = 1 << MILog2SEW; + + unsigned EEW = 1 << Log2EEW; + // Calculate (EEW/SEW)*LMUL preserving fractions less than 1. Use GCD + // to put fraction in simplest form. + unsigned Num = EEW, Denom = MISEW; + int GCD = MILMULIsFractional ? std::gcd(Num, Denom * MILMUL) + : std::gcd(Num * MILMUL, Denom); + Num = MILMULIsFractional ? Num / GCD : Num * MILMUL / GCD; + Denom = MILMULIsFractional ? Denom * MILMUL / GCD : Denom / GCD; + return std::make_pair(Num > Denom ? Num : Denom, Denom > Num); +} +} // end namespace RISCVVType +} // end namespace llvm + +/// Dest has EEW=SEW and EMUL=LMUL. Source EEW=SEW/Factor (i.e. F2 => EEW/2). +/// Source has EMUL=(EEW/SEW)*LMUL. LMUL and SEW comes from TSFlags of MI. +static OperandInfo getIntegerExtensionOperandInfo(unsigned Factor, + const MachineInstr &MI, + const MachineOperand &MO) { + RISCVII::VLMUL MIVLMul = RISCVII::getLMul(MI.getDesc().TSFlags); + unsigned MILog2SEW = + MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); + + if (MO.getOperandNo() == 0) + return OperandInfo(MIVLMul, MILog2SEW); + + unsigned MISEW = 1 << MILog2SEW; + unsigned EEW = MISEW / Factor; + unsigned Log2EEW = Log2_32(EEW); + + return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(Log2EEW, MI), + Log2EEW); +} + +/// Check whether MO is a mask operand of MI. +static bool isMaskOperand(const MachineInstr &MI, const MachineOperand &MO, + const MachineRegisterInfo *MRI) { + + if (!MO.isReg() || !isVectorRegClass(MO.getReg(), MRI)) + return false; + + const MCInstrDesc &Desc = MI.getDesc(); + return Desc.operands()[MO.getOperandNo()].RegClass == RISCV::VMV0RegClassID; +} + +/// Return the OperandInfo for MO, which is an operand of MI. +static OperandInfo getOperandInfo(const MachineInstr &MI, + const MachineOperand &MO, + const MachineRegisterInfo *MRI) { + const RISCVVPseudosTable::PseudoInfo *RVV = + RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()); + assert(RVV && "Could not find MI in PseudoTable"); + + // MI has a VLMUL and SEW associated with it. The RVV specification defines + // the LMUL and SEW of each operand and definition in relation to MI.VLMUL and + // MI.SEW. + RISCVII::VLMUL MIVLMul = RISCVII::getLMul(MI.getDesc().TSFlags); + unsigned MILog2SEW = + MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); + + const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MI.getDesc()); + + // We bail out early for instructions that have passthru with non NoRegister, + // which means they are using TU policy. 
We are not interested in these + // since they must preserve the entire register content. + if (HasPassthru && MO.getOperandNo() == MI.getNumExplicitDefs() && + (MO.getReg() != RISCV::NoRegister)) + return {}; + + bool IsMODef = MO.getOperandNo() == 0; + + // All mask operands have EEW=1, EMUL=(EEW/SEW)*LMUL + if (isMaskOperand(MI, MO, MRI)) + return OperandInfo(RISCVVType::getEMULEqualsEEWDivSEWTimesLMUL(0, MI), 0); + + // switch against BaseInstr to reduce number of cases that need to be + // considered. + switch (RVV->BaseInstr) { + + // 6. Configuration-Setting Instructions + // Configuration setting instructions do not read or write vector registers + case RISCV::VSETIVLI: + case RISCV::VSETVL: + case RISCV::VSETVLI: + llvm_unreachable("Configuration setting instructions do not read or write " + "vector registers"); + + // 11. Vector Integer Arithmetic Instructions + // 11.1. Vector Single-Width Integer Add and Subtract + case RISCV::VADD_VI: + case RISCV::VADD_VV: + case RISCV::VADD_VX: + case RISCV::VSUB_VV: + case RISCV::VSUB_VX: + case RISCV::VRSUB_VI: + case RISCV::VRSUB_VX: + // 11.5. Vector Bitwise Logical Instructions + // 11.6. Vector Single-Width Shift Instructions + // EEW=SEW. EMUL=LMUL. + case RISCV::VAND_VI: + case RISCV::VAND_VV: + case RISCV::VAND_VX: + case RISCV::VOR_VI: + case RISCV::VOR_VV: + case RISCV::VOR_VX: + case RISCV::VXOR_VI: + case RISCV::VXOR_VV: + case RISCV::VXOR_VX: + case RISCV::VSLL_VI: + case RISCV::VSLL_VV: + case RISCV::VSLL_VX: + case RISCV::VSRL_VI: + case RISCV::VSRL_VV: + case RISCV::VSRL_VX: + case RISCV::VSRA_VI: + case RISCV::VSRA_VV: + case RISCV::VSRA_VX: + // 11.9. Vector Integer Min/Max Instructions + // EEW=SEW. EMUL=LMUL. + case RISCV::VMINU_VV: + case RISCV::VMINU_VX: + case RISCV::VMIN_VV: + case RISCV::VMIN_VX: + case RISCV::VMAXU_VV: + case RISCV::VMAXU_VX: + case RISCV::VMAX_VV: + case RISCV::VMAX_VX: + // 11.10. Vector Single-Width Integer Multiply Instructions + // Source and Dest EEW=SEW and EMUL=LMUL. + case RISCV::VMUL_VV: + case RISCV::VMUL_VX: + case RISCV::VMULH_VV: + case RISCV::VMULH_VX: + case RISCV::VMULHU_VV: + case RISCV::VMULHU_VX: + case RISCV::VMULHSU_VV: + case RISCV::VMULHSU_VX: + // 11.11. Vector Integer Divide Instructions + // EEW=SEW. EMUL=LMUL. + case RISCV::VDIVU_VV: + case RISCV::VDIVU_VX: + case RISCV::VDIV_VV: + case RISCV::VDIV_VX: + case RISCV::VREMU_VV: + case RISCV::VREMU_VX: + case RISCV::VREM_VV: + case RISCV::VREM_VX: + // 11.13. Vector Single-Width Integer Multiply-Add Instructions + // EEW=SEW. EMUL=LMUL. + case RISCV::VMACC_VV: + case RISCV::VMACC_VX: + case RISCV::VNMSAC_VV: + case RISCV::VNMSAC_VX: + case RISCV::VMADD_VV: + case RISCV::VMADD_VX: + case RISCV::VNMSUB_VV: + case RISCV::VNMSUB_VX: + // 11.15. Vector Integer Merge Instructions + // EEW=SEW and EMUL=LMUL, except the mask operand has EEW=1 and EMUL= + // (EEW/SEW)*LMUL. Mask operand is handled before this switch. + case RISCV::VMERGE_VIM: + case RISCV::VMERGE_VVM: + case RISCV::VMERGE_VXM: + // 11.16. Vector Integer Move Instructions + // 12. Vector Fixed-Point Arithmetic Instructions + // 12.1. Vector Single-Width Saturating Add and Subtract + // 12.2. Vector Single-Width Averaging Add and Subtract + // EEW=SEW. EMUL=LMUL. 
+ case RISCV::VMV_V_I: + case RISCV::VMV_V_V: + case RISCV::VMV_V_X: + case RISCV::VSADDU_VI: + case RISCV::VSADDU_VV: + case RISCV::VSADDU_VX: + case RISCV::VSADD_VI: + case RISCV::VSADD_VV: + case RISCV::VSADD_VX: + case RISCV::VSSUBU_VV: + case RISCV::VSSUBU_VX: + case RISCV::VSSUB_VV: + case RISCV::VSSUB_VX: + case RISCV::VAADDU_VV: + case RISCV::VAADDU_VX: + case RISCV::VAADD_VV: + case RISCV::VAADD_VX: + case RISCV::VASUBU_VV: + case RISCV::VASUBU_VX: + case RISCV::VASUB_VV: + case RISCV::VASUB_VX: + // 12.4. Vector Single-Width Scaling Shift Instructions + // EEW=SEW. EMUL=LMUL. + case RISCV::VSSRL_VI: + case RISCV::VSSRL_VV: + case RISCV::VSSRL_VX: + case RISCV::VSSRA_VI: + case RISCV::VSSRA_VV: + case RISCV::VSSRA_VX: + // 16. Vector Permutation Instructions + // 16.1. Integer Scalar Move Instructions + // 16.2. Floating-Point Scalar Move Instructions + // EMUL=LMUL. EEW=SEW. + case RISCV::VMV_X_S: + case RISCV::VMV_S_X: + case RISCV::VFMV_F_S: + case RISCV::VFMV_S_F: + // 16.3. Vector Slide Instructions + // EMUL=LMUL. EEW=SEW. + case RISCV::VSLIDEUP_VI: + case RISCV::VSLIDEUP_VX: + case RISCV::VSLIDEDOWN_VI: + case RISCV::VSLIDEDOWN_VX: + case RISCV::VSLIDE1UP_VX: + case RISCV::VFSLIDE1UP_VF: + case RISCV::VSLIDE1DOWN_VX: + case RISCV::VFSLIDE1DOWN_VF: + // 16.4. Vector Register Gather Instructions + // EMUL=LMUL. EEW=SEW. For mask operand, EMUL=1 and EEW=1. + case RISCV::VRGATHER_VI: + case RISCV::VRGATHER_VV: + case RISCV::VRGATHER_VX: + // 16.5. Vector Compress Instruction + // EMUL=LMUL. EEW=SEW. + case RISCV::VCOMPRESS_VM: + return OperandInfo(MIVLMul, MILog2SEW); + + // 11.2. Vector Widening Integer Add/Subtract + // Def uses EEW=2*SEW and EMUL=2*LMUL. Operands use EEW=SEW and EMUL=LMUL. + case RISCV::VWADDU_VV: + case RISCV::VWADDU_VX: + case RISCV::VWSUBU_VV: + case RISCV::VWSUBU_VX: + case RISCV::VWADD_VV: + case RISCV::VWADD_VX: + case RISCV::VWSUB_VV: + case RISCV::VWSUB_VX: + case RISCV::VWSLL_VI: + // 11.12. Vector Widening Integer Multiply Instructions + // Source and Destination EMUL=LMUL. Destination EEW=2*SEW. Source EEW=SEW. + case RISCV::VWMUL_VV: + case RISCV::VWMUL_VX: + case RISCV::VWMULSU_VV: + case RISCV::VWMULSU_VX: + case RISCV::VWMULU_VV: + case RISCV::VWMULU_VX: { + unsigned Log2EEW = IsMODef ? MILog2SEW + 1 : MILog2SEW; + RISCVII::VLMUL EMUL = + IsMODef ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul; + return OperandInfo(EMUL, Log2EEW); + } + + // Def and Op1 uses EEW=2*SEW and EMUL=2*LMUL. Op2 uses EEW=SEW and EMUL=LMUL + case RISCV::VWADDU_WV: + case RISCV::VWADDU_WX: + case RISCV::VWSUBU_WV: + case RISCV::VWSUBU_WX: + case RISCV::VWADD_WV: + case RISCV::VWADD_WX: + case RISCV::VWSUB_WV: + case RISCV::VWSUB_WX: + // 11.14. Vector Widening Integer Multiply-Add Instructions + // Destination EEW=2*SEW and EMUL=2*LMUL. Source EEW=SEW and EMUL=LMUL. + // Even though the add is a 2*SEW addition, the operands of the add are the + // Dest which is 2*SEW and the result of the multiply which is 2*SEW. + case RISCV::VWMACCU_VV: + case RISCV::VWMACCU_VX: + case RISCV::VWMACC_VV: + case RISCV::VWMACC_VX: + case RISCV::VWMACCSU_VV: + case RISCV::VWMACCSU_VX: + case RISCV::VWMACCUS_VX: { + bool IsOp1 = HasPassthru ? MO.getOperandNo() == 1 : MO.getOperandNo() == 2; + bool TwoTimes = IsMODef || IsOp1; + unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW; + RISCVII::VLMUL EMUL = + TwoTimes ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul; + return OperandInfo(EMUL, Log2EEW); + } + + // 11.3. 
Vector Integer Extension + case RISCV::VZEXT_VF2: + case RISCV::VSEXT_VF2: + return getIntegerExtensionOperandInfo(2, MI, MO); + case RISCV::VZEXT_VF4: + case RISCV::VSEXT_VF4: + return getIntegerExtensionOperandInfo(4, MI, MO); + case RISCV::VZEXT_VF8: + case RISCV::VSEXT_VF8: + return getIntegerExtensionOperandInfo(8, MI, MO); + + // 11.7. Vector Narrowing Integer Right Shift Instructions + // Destination EEW=SEW and EMUL=LMUL, Op 1 has EEW=2*SEW EMUL=2*LMUL. Op2 has + // EEW=SEW EMUL=LMUL. + case RISCV::VNSRL_WX: + case RISCV::VNSRL_WI: + case RISCV::VNSRL_WV: + case RISCV::VNSRA_WI: + case RISCV::VNSRA_WV: + case RISCV::VNSRA_WX: + // 12.5. Vector Narrowing Fixed-Point Clip Instructions + // Destination and Op1 EEW=SEW and EMUL=LMUL. Op2 EEW=2*SEW and EMUL=2*LMUL + case RISCV::VNCLIPU_WI: + case RISCV::VNCLIPU_WV: + case RISCV::VNCLIPU_WX: + case RISCV::VNCLIP_WI: + case RISCV::VNCLIP_WV: + case RISCV::VNCLIP_WX: { + bool IsOp1 = HasPassthru ? MO.getOperandNo() == 1 : MO.getOperandNo() == 2; + bool TwoTimes = IsOp1; + unsigned Log2EEW = TwoTimes ? MILog2SEW + 1 : MILog2SEW; + RISCVII::VLMUL EMUL = + TwoTimes ? RISCVVType::twoTimesVLMUL(MIVLMul) : MIVLMul; + return OperandInfo(EMUL, Log2EEW); + } + + default: + return {}; + } +} + +/// Return true if this optimization should consider MI for VL reduction. This +/// white-list approach simplifies this optimization for instructions that may +/// have more complex semantics with relation to how it uses VL. +static bool isSupportedInstr(const MachineInstr &MI) { + const RISCVVPseudosTable::PseudoInfo *RVV = + RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()); + + if (!RVV) + return false; + + switch (RVV->BaseInstr) { + // 11.1. Vector Single-Width Integer Add and Subtract + case RISCV::VADD_VI: + case RISCV::VADD_VV: + case RISCV::VADD_VX: + case RISCV::VSUB_VV: + case RISCV::VSUB_VX: + case RISCV::VRSUB_VI: + case RISCV::VRSUB_VX: + // 11.2. Vector Widening Integer Add/Subtract + case RISCV::VWADDU_VV: + case RISCV::VWADDU_VX: + case RISCV::VWSUBU_VV: + case RISCV::VWSUBU_VX: + case RISCV::VWADD_VV: + case RISCV::VWADD_VX: + case RISCV::VWSUB_VV: + case RISCV::VWSUB_VX: + case RISCV::VWADDU_WV: + case RISCV::VWADDU_WX: + case RISCV::VWSUBU_WV: + case RISCV::VWSUBU_WX: + case RISCV::VWADD_WV: + case RISCV::VWADD_WX: + case RISCV::VWSUB_WV: + case RISCV::VWSUB_WX: + // 11.3. Vector Integer Extension + case RISCV::VZEXT_VF2: + case RISCV::VSEXT_VF2: + case RISCV::VZEXT_VF4: + case RISCV::VSEXT_VF4: + case RISCV::VZEXT_VF8: + case RISCV::VSEXT_VF8: + // 11.4. Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions + // FIXME: Add support for 11.4 instructions + // 11.5. Vector Bitwise Logical Instructions + // FIXME: Add support for 11.5 instructions + // 11.6. Vector Single-Width Shift Instructions + // FIXME: Add support for 11.6 instructions + case RISCV::VSLL_VI: + // 11.7. Vector Narrowing Integer Right Shift Instructions + // FIXME: Add support for 11.7 instructions + case RISCV::VNSRL_WI: + // 11.8 Vector Integer Compare Instructions + // FIXME: Add support for 11.8 instructions + // 11.9. Vector Integer Min/Max Instructions + // FIXME: Add support for 11.9 instructions + // 11.10. Vector Single-Width Integer Multiply Instructions + case RISCV::VMUL_VV: + case RISCV::VMUL_VX: + case RISCV::VMULH_VV: + case RISCV::VMULH_VX: + case RISCV::VMULHU_VV: + case RISCV::VMULHU_VX: + case RISCV::VMULHSU_VV: + case RISCV::VMULHSU_VX: + // 11.11. Vector Integer Divide Instructions + // FIXME: Add support for 11.11 instructions + // 11.12. 
Vector Widening Integer Multiply Instructions + // FIXME: Add support for 11.12 instructions + // 11.13. Vector Single-Width Integer Multiply-Add Instructions + // FIXME: Add support for 11.13 instructions + // 11.14. Vector Widening Integer Multiply-Add Instructions + // FIXME: Add support for 11.14 instructions + case RISCV::VWMACC_VX: + case RISCV::VWMACCU_VX: + // 11.15. Vector Integer Merge Instructions + // FIXME: Add support for 11.15 instructions + // 11.16. Vector Integer Move Instructions + // FIXME: Add support for 11.16 instructions + case RISCV::VMV_V_I: + case RISCV::VMV_V_X: + + // Vector Crypto + case RISCV::VWSLL_VI: + return true; + } + + return false; +} + +/// Return true if MO is a vector operand but is used as a scalar operand. +static bool isVectorOpUsedAsScalarOp(MachineOperand &MO) { + MachineInstr *MI = MO.getParent(); + const RISCVVPseudosTable::PseudoInfo *RVV = + RISCVVPseudosTable::getPseudoInfo(MI->getOpcode()); + + if (!RVV) + return false; + + switch (RVV->BaseInstr) { + // Reductions only use vs1[0] of vs1 + case RISCV::VREDAND_VS: + case RISCV::VREDMAX_VS: + case RISCV::VREDMAXU_VS: + case RISCV::VREDMIN_VS: + case RISCV::VREDMINU_VS: + case RISCV::VREDOR_VS: + case RISCV::VREDSUM_VS: + case RISCV::VREDXOR_VS: + case RISCV::VWREDSUM_VS: + case RISCV::VWREDSUMU_VS: + case RISCV::VFREDMAX_VS: + case RISCV::VFREDMIN_VS: + case RISCV::VFREDOSUM_VS: + case RISCV::VFREDUSUM_VS: + case RISCV::VFWREDOSUM_VS: + case RISCV::VFWREDUSUM_VS: { + bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MI->getDesc()); + return HasPassthru ? MO.getOperandNo() == 2 : MO.getOperandNo() == 3; + } + default: + return false; + } +} + +/// Return true if MI may read elements past VL. +static bool mayReadPastVL(const MachineInstr &MI) { + const RISCVVPseudosTable::PseudoInfo *RVV = + RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()); + if (!RVV) + return true; + + switch (RVV->BaseInstr) { + // vslidedown instructions may read elements past VL. They are handled + // according to current tail policy. + case RISCV::VSLIDEDOWN_VI: + case RISCV::VSLIDEDOWN_VX: + case RISCV::VSLIDE1DOWN_VX: + case RISCV::VFSLIDE1DOWN_VF: + + // vrgather instructions may read the source vector at any index < VLMAX, + // regardless of VL. + case RISCV::VRGATHER_VI: + case RISCV::VRGATHER_VV: + case RISCV::VRGATHER_VX: + case RISCV::VRGATHEREI16_VV: + return true; + + default: + return false; + } +} + +bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const { + const MCInstrDesc &Desc = MI.getDesc(); + if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) + return false; + if (MI.getNumDefs() != 1) + return false; + + unsigned VLOpNum = RISCVII::getVLOpNum(Desc); + const MachineOperand &VLOp = MI.getOperand(VLOpNum); + if (!VLOp.isImm() || VLOp.getImm() != RISCV::VLMaxSentinel) + return false; + + // Some instructions that produce vectors have semantics that make it more + // difficult to determine whether the VL can be reduced. For example, some + // instructions, such as reductions, may write lanes past VL to a scalar + // register. Other instructions, such as some loads or stores, may write + // lower lanes using data from higher lanes. There may be other complex + // semantics not mentioned here that make it hard to determine whether + // the VL can be optimized. As a result, a white-list of supported + // instructions is used. Over time, more instructions cam be supported + // upon careful examination of their semantics under the logic in this + // optimization. 
+  // TODO: Use a better approach than a white-list, such as adding
+  // properties to instructions using something like TSFlags.
+  if (!isSupportedInstr(MI)) {
+    LLVM_DEBUG(dbgs() << "Not a candidate due to unsupported instruction\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Found a candidate for VL reduction: " << MI << "\n");
+  return true;
+}
+
+bool RISCVVLOptimizer::checkUsers(std::optional<Register> &CommonVL,
+                                  MachineInstr &MI) {
+  // FIXME: Avoid visiting each user every time we visit something on the
+  // worklist, combined with an extra visit from the outer loop. Restructure
+  // along lines of an instcombine style worklist which integrates the outer
+  // pass.
+  bool CanReduceVL = true;
+  for (auto &UserOp : MRI->use_operands(MI.getOperand(0).getReg())) {
+    const MachineInstr &UserMI = *UserOp.getParent();
+    LLVM_DEBUG(dbgs() << "  Checking user: " << UserMI << "\n");
+
+    // Instructions like reductions may use a vector register as a scalar
+    // register. In this case, we should treat it like a scalar register which
+    // does not impact the decision on whether to optimize VL.
+    if (isVectorOpUsedAsScalarOp(UserOp)) {
+      [[maybe_unused]] Register R = UserOp.getReg();
+      [[maybe_unused]] const TargetRegisterClass *RC = MRI->getRegClass(R);
+      assert(RISCV::VRRegClass.hasSubClassEq(RC) &&
+             "Expect LMUL 1 register class for vector as scalar operands!");
+      LLVM_DEBUG(dbgs() << "    Use this operand as a scalar operand\n");
+      continue;
+    }
+
+    if (mayReadPastVL(UserMI)) {
+      LLVM_DEBUG(dbgs() << "    Abort because used by unsafe instruction\n");
+      CanReduceVL = false;
+      break;
+    }
+
+    // Tied operands might pass through.
+    if (UserOp.isTied()) {
+      LLVM_DEBUG(dbgs() << "    Abort because user is a tied operand\n");
+      CanReduceVL = false;
+      break;
+    }
+
+    const MCInstrDesc &Desc = UserMI.getDesc();
+    if (!RISCVII::hasVLOp(Desc.TSFlags) || !RISCVII::hasSEWOp(Desc.TSFlags)) {
+      LLVM_DEBUG(dbgs() << "    Abort due to lack of VL or SEW; assume the"
+                           " user reads VLMAX\n");
+      CanReduceVL = false;
+      break;
+    }
+
+    unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
+    const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
+    // Looking for a register VL that isn't X0.
+    if (!VLOp.isReg() || VLOp.getReg() == RISCV::X0) {
+      LLVM_DEBUG(dbgs() << "    Abort because user uses X0 as VL.\n");
+      CanReduceVL = false;
+      break;
+    }
+
+    if (!CommonVL) {
+      CommonVL = VLOp.getReg();
+    } else if (*CommonVL != VLOp.getReg()) {
+      LLVM_DEBUG(dbgs() << "    Abort because users have different VL\n");
+      CanReduceVL = false;
+      break;
+    }
+
+    // The SEW and LMUL of destination and source registers need to match.
+
+    // We know that MI DEF is a vector register, because that was the guard
+    // to call this function.
+    assert(isVectorRegClass(UserMI.getOperand(0).getReg(), MRI) &&
+           "Expected DEF and USE to be vector registers");
+
+    OperandInfo ConsumerInfo = getOperandInfo(UserMI, UserOp, MRI);
+    OperandInfo ProducerInfo = getOperandInfo(MI, MI.getOperand(0), MRI);
+    if (ConsumerInfo.isUnknown() || ProducerInfo.isUnknown() ||
+        !OperandInfo::EMULAndEEWAreEqual(ConsumerInfo, ProducerInfo)) {
+      LLVM_DEBUG(dbgs() << "    Abort due to incompatible or unknown "
+                           "information for EMUL or EEW.\n");
+      LLVM_DEBUG(dbgs() << "    ConsumerInfo is: " << ConsumerInfo << "\n");
+      LLVM_DEBUG(dbgs() << "    ProducerInfo is: " << ProducerInfo << "\n");
+      CanReduceVL = false;
+      break;
+    }
+  }
+  return CanReduceVL;
+}
+
+bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) {
+  SetVector<MachineInstr *> Worklist;
+  Worklist.insert(&OrigMI);
+
+  bool MadeChange = false;
+  while (!Worklist.empty()) {
+    MachineInstr &MI = *Worklist.pop_back_val();
+    LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n");
+
+    std::optional<Register> CommonVL;
+    bool CanReduceVL = true;
+    if (isVectorRegClass(MI.getOperand(0).getReg(), MRI))
+      CanReduceVL = checkUsers(CommonVL, MI);
+
+    if (!CanReduceVL || !CommonVL)
+      continue;
+
+    if (!CommonVL->isVirtual()) {
+      LLVM_DEBUG(
+          dbgs() << "  Abort because the new VL is not a virtual register.\n");
+      continue;
+    }
+
+    const MachineInstr *VLMI = MRI->getVRegDef(*CommonVL);
+    if (!MDT->dominates(VLMI, &MI))
+      continue;
+
+    // All our checks passed. We can reduce VL.
+    LLVM_DEBUG(dbgs() << "  Reducing VL for: " << MI << "\n");
+    unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc());
+    MachineOperand &VLOp = MI.getOperand(VLOpNum);
+    VLOp.ChangeToRegister(*CommonVL, false);
+    MadeChange = true;
+
+    // Now add all inputs to this instruction to the worklist.
+    for (auto &Op : MI.operands()) {
+      if (!Op.isReg() || !Op.isUse() || !Op.getReg().isVirtual())
+        continue;
+
+      if (!isVectorRegClass(Op.getReg(), MRI))
+        continue;
+
+      MachineInstr *DefMI = MRI->getVRegDef(Op.getReg());
+
+      if (!isCandidate(*DefMI))
+        continue;
+
+      Worklist.insert(DefMI);
+    }
+  }
+
+  return MadeChange;
+}
+
+bool RISCVVLOptimizer::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MRI = &MF.getRegInfo();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+
+  const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
+  if (!ST.hasVInstructions())
+    return false;
+
+  bool MadeChange = false;
+  for (MachineBasicBlock &MBB : MF) {
+    // Visit instructions in reverse order.
+ for (auto &MI : make_range(MBB.rbegin(), MBB.rend())) { + if (!isCandidate(MI)) + continue; + + MadeChange |= tryReduceVL(MI); + } + } + + return MadeChange; +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll new file mode 100644 index 0000000000000..107252338829b --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -0,0 +1,973 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvbb -riscv-enable-vl-optimizer=false -verify-machineinstrs | FileCheck %s --check-prefixes=NOVLOPT +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvbb -riscv-enable-vl-optimizer=false -verify-machineinstrs | FileCheck %s --check-prefixes=NOVLOPT +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvbb -riscv-enable-vl-optimizer -verify-machineinstrs | FileCheck %s --check-prefixes=VLOPT +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvbb -riscv-enable-vl-optimizer -verify-machineinstrs | FileCheck %s --check-prefixes=VLOPT + +; The purpose of this file is to check the behavior of specific instructions as it relates to the VL optimizer + +define @vadd_vi( %a, iXLen %vl) { +; NOVLOPT-LABEL: vadd_vi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vi v10, v8, 5 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vadd_vi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vi v10, v8, 5 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %a, i32 5, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vadd_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vadd_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vadd_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v8, v10 +; VLOPT-NEXT: vadd.vv v8, v8, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vadd_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vadd_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vadd_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vx v10, v8, a0 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vsub_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vsub_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsub.vv v8, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vsub.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsub_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: 
vsub.vv v8, v8, v10 +; VLOPT-NEXT: vsub.vv v8, v8, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsub.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vsub.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vsub_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vsub_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsub.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vsub.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsub_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vsub.vx v10, v8, a0 +; VLOPT-NEXT: vsub.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsub.nxv4i32.nxv4i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vsub.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vrsub_vi( %a, iXLen %vl) { +; NOVLOPT-LABEL: vrsub_vi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vrsub.vi v10, v8, 5 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vrsub_vi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vrsub.vi v10, v8, 5 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vrsub.nxv4i32.nxv4i32( poison, %a, i32 5, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vrsub_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vrsub_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vrsub.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vsub.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vrsub_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vrsub.vx v10, v8, a0 +; VLOPT-NEXT: vsub.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vrsub.nxv4i32.nxv4i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vsub.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vwaddu_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwaddu_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwaddu.vv v12, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwaddu_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwaddu.vv v12, v8, v10 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwaddu.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwaddu_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwaddu_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwaddu.vx v12, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwaddu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwaddu.vx v12, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwaddu.nxv4i64.nxv4i32.i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + 
+define @vwsubu_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwsubu_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwsubu.vv v12, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwsubu_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwsubu.vv v12, v8, v10 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwsubu.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwsubu_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwsubu_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwsubu.vx v12, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwsubu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwsubu.vx v12, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwsubu.nxv4i64.nxv4i32.i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwadd_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwadd_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwadd.vv v12, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwadd_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwadd.vv v12, v8, v10 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwadd.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwadd_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwadd_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwadd.vx v12, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwadd_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwadd.vx v12, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwadd.nxv4i64.nxv4i32.i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwsub_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwsub_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwsub.vv v12, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwsub_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwsub.vv v12, v8, v10 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwsub.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwsub_vx( %a, i32 %b, iXLen %vl) { +; 
NOVLOPT-LABEL: vwsub_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwsub.vx v12, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwsub_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwsub.vx v12, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwsub.nxv4i64.nxv4i32.i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwaddu_wv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwaddu_wv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwaddu.wv v8, v8, v12 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwaddu_wv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwaddu.wv v8, v8, v12 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v8, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwaddu.w.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwaddu_wx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwaddu_wx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwaddu.wx v8, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwaddu_wx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwaddu.wx v8, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v8, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwaddu.w.xv4i64.nxv4i32.i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwsubu_wv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwsubu_wv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwsubu.wv v8, v8, v12 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwsubu_wv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwsubu.wv v8, v8, v12 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v8, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwsubu.w.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwsubu_wx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwsubu_wx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwsubu.wx v8, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwsubu_wx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwsubu.wx v8, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v8, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwsubu.w.nxv4i64.nxv4i32.i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwadd_wv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwadd_wv: +; NOVLOPT: # %bb.0: +; 
NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwadd.wv v8, v8, v12 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwadd_wv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwadd.wv v8, v8, v12 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v8, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwadd.w.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwadd_wx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwadd_wx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwadd.wx v8, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwadd_wx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwadd.wx v8, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v8, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwadd.w.nxv4i64.nxv4i32.i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwsub_wv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwsub_wv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwsub.wv v8, v8, v12 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwsub_wv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwsub.wv v8, v8, v12 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v8, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwsub.w.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwsub_wx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwsub_wx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwsub.wx v8, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwsub_wx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwsub.wx v8, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v8, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwsub.w.nxv4i64.nxv4i32.i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vsext_vf2( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vsext_vf2: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsext.vf2 v12, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsext_vf2: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vsext.vf2 v12, v8 +; VLOPT-NEXT: vadd.vv v8, v12, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsext.nxv4i32.nxv4i16( poison, %a, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vsext_vf4( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vsext_vf4: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsext.vf4 v12, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; 
NOVLOPT-NEXT: vadd.vv v8, v12, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsext_vf4: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vsext.vf4 v12, v8 +; VLOPT-NEXT: vadd.vv v8, v12, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsext.nxv4i32.nxv4i8( poison, %a, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vsext_vf8( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vsext_vf8: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; NOVLOPT-NEXT: vsext.vf8 v16, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v16, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsext_vf8: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; VLOPT-NEXT: vsext.vf8 v16, v8 +; VLOPT-NEXT: vadd.vv v8, v16, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsext.nxv4i32.nxv4i8( poison, %a, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vzext_vf2( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vzext_vf2: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vzext.vf2 v12, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vzext_vf2: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vzext.vf2 v12, v8 +; VLOPT-NEXT: vadd.vv v8, v12, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vzext.nxv4i32.nxv4i16( poison, %a, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vzext_vf4( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vzext_vf4: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vzext.vf4 v12, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vzext_vf4: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vzext.vf4 v12, v8 +; VLOPT-NEXT: vadd.vv v8, v12, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vzext.nxv4i32.nxv4i8( poison, %a, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vzext_vf8( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vzext_vf8: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; NOVLOPT-NEXT: vzext.vf8 v16, v8 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v16, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vzext_vf8: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; VLOPT-NEXT: vzext.vf8 v16, v8 +; VLOPT-NEXT: vadd.vv v8, v16, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vzext.nxv4i32.nxv4i8( poison, %a, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vsll_vi( %a, iXLen %vl) { +; NOVLOPT-LABEL: vsll_vi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vsll.vi v10, v8, 5 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vsll.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vsll_vi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vsll.vi v10, v8, 5 +; VLOPT-NEXT: vsll.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vsll.nxv4i32.nxv4i32( poison, %a, iXLen 5, iXLen -1) + %2 = call @llvm.riscv.vsll.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define 
@vnsrl_wi( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vnsrl_wi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; NOVLOPT-NEXT: vnsrl.wi v11, v8, 5 +; NOVLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v11, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vnsrl_wi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; VLOPT-NEXT: vnsrl.wi v11, v8, 5 +; VLOPT-NEXT: vadd.vv v8, v11, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vnsrl.nxv4i16.nxv4i32( poison, %a, iXLen 5, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i16.nxv4i16( poison, %1, %b, iXLen %vl) + ret %2 +} + + + +define @vmul_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vmul_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmul.vv v8, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vmul.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmul_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vmul.vv v8, v8, v10 +; VLOPT-NEXT: vmul.vv v8, v8, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmul.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmul.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vmul_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vmul_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmul.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vmul.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmul_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vmul.vx v10, v8, a0 +; VLOPT-NEXT: vmul.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmul.nxv4i32.nxv4i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vmul.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vmulh_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vmulh_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmulh.vv v8, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vmul.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmulh_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vmulh.vv v8, v8, v10 +; VLOPT-NEXT: vmul.vv v8, v8, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmulh.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmul.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vmulh_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vmulh_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmulh.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vmul.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmulh_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vmulh.vx v10, v8, a0 +; VLOPT-NEXT: vmul.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmulh.nxv4i32.nxv4i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vmul.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vmulhu_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vmulhu_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmulhu.vv v8, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vmul.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmulhu_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, 
m2, ta, ma +; VLOPT-NEXT: vmulhu.vv v8, v8, v10 +; VLOPT-NEXT: vmul.vv v8, v8, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmulhu.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmul.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vmulhu_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vmulhu_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmulhu.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vmul.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmulhu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vmulhu.vx v10, v8, a0 +; VLOPT-NEXT: vmul.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmulhu.nxv4i32.nxv4i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vmul.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vmulhsu_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vmulhsu_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmulhsu.vv v8, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vmul.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmulhsu_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vmulhsu.vv v8, v8, v10 +; VLOPT-NEXT: vmul.vv v8, v8, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmulhsu.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vmul.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +define @vmulhsu_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vmulhsu_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmulhsu.vx v10, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vmul.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmulhsu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vmulhsu.vx v10, v8, a0 +; VLOPT-NEXT: vmul.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmulhsu.nxv4i32.nxv4i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vmul.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vwmacc_vx( %a, i16 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmacc_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; NOVLOPT-NEXT: vwmacc.vx v10, a0, v8 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmacc_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; VLOPT-NEXT: vwmacc.vx v10, a0, v8 +; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v10, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmacc.nxv4i32.i16( poison, i16 %b, %a, iXLen -1, iXLen 0) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwmaccu_vx( %a, i16 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmaccu_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e16, m1, ta, ma +; NOVLOPT-NEXT: vwmaccu.vx v10, a0, v8 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmaccu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; VLOPT-NEXT: vwmaccu.vx v10, a0, v8 +; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v10, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmaccu.nxv4i32.i16( poison, i16 %b, %a, iXLen -1, iXLen 0) + %2 = call 
@llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vmv_v_i( %a, i32 %x, iXLen %vl) { +; NOVLOPT-LABEL: vmv_v_i: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmv.v.i v10, 5 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmv_v_i: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vmv.v.i v10, 5 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmv.v.x.nxv4i32( poison, i32 5, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vmv_v_x( %a, i32 %x, iXLen %vl) { +; NOVLOPT-LABEL: vmv_v_x: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vmv.v.x v10, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v8 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmv_v_x: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vmv.v.x v10, a0 +; VLOPT-NEXT: vadd.vv v8, v10, v8 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vmv.v.x.nxv4i32( poison, i32 %x, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %a, iXLen %vl) + ret %2 +} + +define @vwsll_vi( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwsll_vi: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; NOVLOPT-NEXT: vwsll.vi v12, v8, 1 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwsll_vi: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; VLOPT-NEXT: vwsll.vi v12, v8, 1 +; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v10 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwsll.nxv4i32.nxv4i16( poison, %a,iXLen 1, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %1, %b, iXLen %vl) + ret %2 +} + +; Test getOperandInfo + +define @vmerge_vim( %a, i8 %b, %m, iXLen %vl) { +; NOVLOPT-LABEL: vmerge_vim: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e8, mf8, tu, ma +; NOVLOPT-NEXT: vmv.v.x v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmerge.vim v8, v8, 2, v0 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmerge_vim: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e8, mf8, tu, ma +; VLOPT-NEXT: vmv.v.x v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; VLOPT-NEXT: vmerge.vim v8, v8, 2, v0 +; VLOPT-NEXT: ret + %2 = call @llvm.riscv.vmv.v.x.nxv1i8( %a, i8 %b, iXLen -1) + %3 = call @llvm.riscv.vmerge.nxv1i8.nxv1i8( undef, %2, i8 2, %m, iXLen %vl) + ret %3 +} + +define @vmerge_vxm( %a, i8 %b, %m, iXLen %vl) { +; NOVLOPT-LABEL: vmerge_vxm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e8, mf8, tu, ma +; NOVLOPT-NEXT: vmv.v.x v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmerge.vxm v8, v8, a0, v0 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmerge_vxm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e8, mf8, tu, ma +; VLOPT-NEXT: vmv.v.x v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; VLOPT-NEXT: vmerge.vxm v8, v8, a0, v0 +; VLOPT-NEXT: ret + %2 = call @llvm.riscv.vmv.v.x.nxv1i8( %a, i8 %b, iXLen -1) + %3 = call @llvm.riscv.vmerge.nxv1i8.nxv1i8( undef, %2, i8 %b, %m, iXLen %vl) + ret %3 +} + +define @vmerge_vvm( %a, i8 %b, %c, %m, iXLen %vl) { +; NOVLOPT-LABEL: vmerge_vvm: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e8, mf8, 
tu, ma +; NOVLOPT-NEXT: vmv.v.x v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; NOVLOPT-NEXT: vmerge.vvm v8, v8, v9, v0 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vmerge_vvm: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e8, mf8, tu, ma +; VLOPT-NEXT: vmv.v.x v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e8, mf8, ta, ma +; VLOPT-NEXT: vmerge.vvm v8, v8, v9, v0 +; VLOPT-NEXT: ret + %2 = call @llvm.riscv.vmv.v.x.nxv1i8( %a, i8 %b, iXLen -1) + %3 = call @llvm.riscv.vmerge.nxv1i8.nxv1i8( undef, %2, %c, %m, iXLen %vl) + ret %3 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll index 2b3c5417b15b5..a7abd90ea7391 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvl512b -verify-machineinstrs | FileCheck %s -; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvl512b -verify-machineinstrs | FileCheck %s +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvl512b -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,NOVLOPT +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvl512b -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,NOVLOPT +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvl512b -riscv-enable-vl-optimizer -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,VLOPT +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvl512b -riscv-enable-vl-optimizer -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,VLOPT + define <2 x i32> @vdot_lane_s32(<2 x i32> noundef %var_1, <8 x i8> noundef %var_3, <8 x i8> noundef %var_5, <8 x i16> %x) { ; CHECK-LABEL: vdot_lane_s32: @@ -81,3 +84,6 @@ entry: ret %x } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; NOVLOPT: {{.*}} +; VLOPT: {{.*}}
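To make the EMUL bookkeeping in RISCVVLOptimizer.cpp concrete, here is a small standalone sketch of the `EMUL = (EEW / SEW) * LMUL` ratio arithmetic that `getEMULEqualsEEWDivSEWTimesLMUL` performs, with two worked cases. It is illustrative only and not part of the patch: the helper name `emulForEEW` and its plain-integer interface are invented here, whereas the pass itself reads SEW and LMUL from the instruction's TSFlags and returns the result as an `OperandInfo`.

```cpp
// Standalone sketch (not part of the patch) of the reduced-fraction EMUL
// computation. Assumes C++17 for std::gcd and structured bindings.
#include <cstdio>
#include <numeric>
#include <utility>

// Returns {Ratio, IsFractional}: {4, false} means EMUL = 4, {16, true} means
// EMUL = 1/16, mirroring how OperandInfo stores EMUL.
static std::pair<unsigned, bool> emulForEEW(unsigned EEW, unsigned SEW,
                                            unsigned LMUL,
                                            bool LMULIsFractional) {
  // EMUL = (EEW / SEW) * LMUL, kept as a fraction Num/Denom in lowest terms.
  unsigned Num = EEW, Denom = SEW;
  unsigned GCD = LMULIsFractional ? std::gcd(Num, Denom * LMUL)
                                  : std::gcd(Num * LMUL, Denom);
  Num = LMULIsFractional ? Num / GCD : Num * LMUL / GCD;
  Denom = LMULIsFractional ? Denom * LMUL / GCD : Denom / GCD;
  return {Num > Denom ? Num : Denom, Denom > Num};
}

int main() {
  // Widening destination: EEW=64, SEW=32, LMUL=2 -> EMUL = (64/32)*2 = 4.
  auto [R1, F1] = emulForEEW(64, 32, 2, /*LMULIsFractional=*/false);
  std::printf("EMUL = %s%u\n", F1 ? "1/" : "", R1); // prints "EMUL = 4"

  // Mask operand: EEW=1, SEW=32, LMUL=2 -> EMUL = (1/32)*2 = 1/16, which has
  // no RISCVII::VLMUL encoding; hence the {ratio, fractional} representation.
  auto [R2, F2] = emulForEEW(1, 32, 2, /*LMULIsFractional=*/false);
  std::printf("EMUL = %s%u\n", F2 ? "1/" : "", R2); // prints "EMUL = 1/16"
  return 0;
}
```

As the RUN lines in the new tests show, the pass is off by default and is exercised by passing -riscv-enable-vl-optimizer to llc.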