15 | 15 |
16 | 16 | namespace Eigen {
17 | 17 |
18 |    | -#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
19 |    | -namespace internal {
20 |    | -
21 |    | -template<typename LhsScalar, typename LhsMapper, typename Index>
22 |    | -struct packLhsArg {
23 |    | -  LhsScalar* blockA;
24 |    | -  const LhsMapper& lhs;
25 |    | -  const Index m_start;
26 |    | -  const Index k_start;
27 |    | -  const Index mc;
28 |    | -  const Index kc;
29 |    | -};
30 |    | -
31 |    | -template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
32 |    | -struct packRhsAndKernelArg {
33 |    | -  const MaxSizeVector<LhsScalar*>* blockAs;
34 |    | -  RhsScalar* blockB;
35 |    | -  const RhsMapper& rhs;
36 |    | -  OutputMapper& output;
37 |    | -  const Index m;
38 |    | -  const Index k;
39 |    | -  const Index n;
40 |    | -  const Index mc;
41 |    | -  const Index kc;
42 |    | -  const Index nc;
43 |    | -  const Index num_threads;
44 |    | -  const Index num_blockAs;
45 |    | -  const Index max_m;
46 |    | -  const Index k_block_idx;
47 |    | -  const Index m_block_idx;
48 |    | -  const Index n_block_idx;
49 |    | -  const Index m_blocks;
50 |    | -  const Index n_blocks;
51 |    | -  MaxSizeVector<Notification*>* kernel_notifications;
52 |    | -  const MaxSizeVector<Notification*>* lhs_notifications;
53 |    | -  const bool need_to_pack;
54 |    | -};
55 |    | -
56 |    | -} // end namespace internal
57 |    | -#endif // EIGEN_USE_SIMPLE_THREAD_POOL
58 |    | -
59 | 18 | template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
60 | 19 | struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
61 | 20 |     public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
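The two internal argument structs removed above, packLhsArg and packRhsAndKernelArg, exist only to bundle the parameters of the packing and kernel tasks that the SimpleThreadPool path enqueued, because that pool runs plain functions taking a single argument by value. A minimal standalone sketch of the same pattern follows; it is not Eigen code, and the matrix layout, the PackLhsArg/packLhs names, and the use of std::async in place of ThreadPoolDevice::enqueue are illustrative assumptions.

    #include <cstdio>
    #include <future>
    #include <vector>

    // Argument bundle, analogous in spirit to internal::packLhsArg.
    struct PackLhsArg {
      float* blockA;          // destination buffer for the packed block
      const float* lhs;       // source matrix, column-major
      long m_start, k_start;  // top-left corner of the block inside lhs
      long mc, kc;            // block dimensions
      long lhs_rows;          // leading dimension (number of rows) of lhs
    };

    // Task body: copy an mc x kc panel of lhs into blockA, column by column.
    static void packLhs(PackLhsArg arg) {
      for (long j = 0; j < arg.kc; ++j)
        for (long i = 0; i < arg.mc; ++i)
          arg.blockA[j * arg.mc + i] =
              arg.lhs[(arg.k_start + j) * arg.lhs_rows + (arg.m_start + i)];
    }

    int main() {
      const long rows = 8, cols = 8, mc = 4, kc = 4;
      std::vector<float> lhs(rows * cols, 1.0f), blockA(mc * kc, 0.0f);
      PackLhsArg arg{blockA.data(), lhs.data(), 0, 0, mc, kc, rows};
      // Stand-in for enqueueing on the thread pool: run the task asynchronously.
      auto task = std::async(std::launch::async, packLhs, arg);
      task.wait();
      std::printf("packed a %ld x %ld block, blockA[0] = %g\n", mc, kc, blockA[0]);
      return 0;
    }

Bundling everything into one copyable struct keeps the enqueue signature simple and makes each task self-contained once it has been handed to a worker thread.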
@@ -112,7 +71,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
112 | 71 |   TensorEvaluator(const XprType& op, const Device& device) :
113 | 72 |       Base(op, device) {}
114 | 73 |
115 |    | -#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
116 | 74 |   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
117 | 75 |             bool rhs_inner_dim_reordered, int Alignment>
118 | 76 |   void evalProduct(Scalar* buffer) const {
@@ -763,288 +721,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
763 | 721 |     return 0;
764 | 722 |   }
765 | 723 |
766 |    | -#else // EIGEN_USE_SIMPLE_THREAD_POOL
767 |    | -  // TODO(ezhulenev): SimpleThreadPool will be removed in the future, and seems
768 |    | -  // like it's not worth adding output kernel support here.
769 |    | -  static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
770 |    | -                "SimpleThreadPool does not support contraction output kernels.");
771 |    | -
772 |    | -  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
773 |    | -  void evalProduct(Scalar* buffer) const {
774 |    | -    if (this->m_j_size == 1) {
775 |    | -      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
776 |    | -      return;
777 |    | -    }
778 |    | -
779 |    | -    evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
780 |    | -  }
781 |    | -
782 |    | -  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
783 |    | -  void evalGemm(Scalar* buffer) const {
784 |    | -    // columns in left side, rows in right side
785 |    | -    const Index k = this->m_k_size;
786 |    | -
787 |    | -    // rows in left side
788 |    | -    const Index m = this->m_i_size;
789 |    | -
790 |    | -    // columns in right side
791 |    | -    const Index n = this->m_j_size;
792 |    | -
793 |    | -    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
794 |    | -    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
795 |    | -
796 |    | -
797 |    | -    const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
798 |    | -    const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
799 |    | -
800 |    | -    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
801 |    | -                                                   LeftEvaluator, left_nocontract_t,
802 |    | -                                                   contract_t, lhs_packet_size,
803 |    | -                                                   lhs_inner_dim_contiguous,
804 |    | -                                                   false, Unaligned> LhsMapper;
805 |    | -
806 |    | -    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
807 |    | -                                                   RightEvaluator, right_nocontract_t,
808 |    | -                                                   contract_t, rhs_packet_size,
809 |    | -                                                   rhs_inner_dim_contiguous,
810 |    | -                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
811 |    | -
812 |    | -    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
813 |    | -
814 |    | -    // TODO: packing could be faster sometimes if we supported row major tensor mappers
815 |    | -    typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
816 |    | -                                    Traits::LhsProgress, ColMajor> LhsPacker;
817 |    | -    typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;
818 |    | -
819 |    | -    // TODO: replace false, false with conjugate values?
820 |    | -    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
821 |    | -                                  Traits::mr, Traits::nr, false, false> GebpKernel;
822 |    | -
823 |    | -    typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
824 |    | -    typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;
825 |    | -
826 |    | -    // initialize data mappers
827 |    | -    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
828 |    | -                  this->m_left_contracting_strides, this->m_k_strides);
829 |    | -
830 |    | -    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
831 |    | -                  this->m_right_contracting_strides, this->m_k_strides);
832 |    | -
833 |    | -    OutputMapper output(buffer, m);
834 |    | -
835 |    | -    // compute block sizes (which depend on number of threads)
836 |    | -    const Index num_threads = this->m_device.numThreads();
837 |    | -    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
838 |    | -    Index mc = blocking.mc();
839 |    | -    Index nc = blocking.nc();
840 |    | -    Index kc = blocking.kc();
841 |    | -    eigen_assert(mc <= m);
842 |    | -    eigen_assert(nc <= n);
843 |    | -    eigen_assert(kc <= k);
844 |    | -
845 |    | -#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
846 |    | -    const Index k_blocks = CEIL_DIV(k, kc);
847 |    | -    const Index n_blocks = CEIL_DIV(n, nc);
848 |    | -    const Index m_blocks = CEIL_DIV(m, mc);
849 |    | -    const Index sizeA = mc * kc;
850 |    | -    const Index sizeB = kc * nc;
851 |    | -
852 |    | -    /* cout << "m: " << m << " n: " << n << " k: " << k << endl;
853 |    | -    cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
854 |    | -    cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
855 |    | -    cout << "num threads: " << num_threads << endl;
856 |    | -    */
857 |    | -
858 |    | -    // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
859 |    | -    //       aren't 16 byte aligned segfaults will happen due to SIMD instructions
860 |    | -    // note: You can get away with allocating just a single blockA and offsets and meet the
861 |    | -    //       the alignment requirements with the assumption that
862 |    | -    //       (Traits::mr * sizeof(ResScalar)) % 16 == 0
863 |    | -    const Index numBlockAs = numext::mini(num_threads, m_blocks);
864 |    | -    MaxSizeVector<LhsScalar *> blockAs(num_threads);
865 |    | -    for (int i = 0; i < num_threads; i++) {
866 |    | -      blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
867 |    | -    }
868 |    | -
869 |    | -    // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
870 |    | -    // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
871 |    | -    // Other options: (1) reuse memory when a thread finishes. con: tricky
872 |    | -    //                (2) allocate block B memory in each thread. con: overhead
873 |    | -    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
874 |    | -    for (int i = 0; i < n_blocks; i++) {
875 |    | -      blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
876 |    | -    }
877 |    | -
878 |    | -    // lhs_notifications starts with all null Notifications
879 |    | -    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
880 |    | -
881 |    | -    // this should really be numBlockAs * n_blocks;
882 |    | -    const Index num_kernel_notifications = num_threads * n_blocks;
883 |    | -    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
884 |    | -                                                      nullptr);
885 |    | -
886 |    | -    for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
887 |    | -      const Index k_start = k_block_idx * kc;
888 |    | -      // make sure we don't overshoot right edge of left matrix
889 |    | -      const Index actual_kc = numext::mini(k_start + kc, k) - k_start;
890 |    | -
891 |    | -      for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
892 |    | -        const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs);
893 |    | -
894 |    | -        for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
895 |    | -          const Index m_start = mt_block_idx * mc;
896 |    | -          const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
897 |    | -          eigen_assert(actual_mc > 0);
898 |    | -
899 |    | -          Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
900 |    | -
901 |    | -          for (int i = 0; i < n_blocks; ++i) {
902 |    | -            Index notification_id = (blockAId * n_blocks + i);
903 |    | -            // Wait for any current kernels using this slot to complete
904 |    | -            // before using it.
905 |    | -            if (kernel_notifications[notification_id]) {
906 |    | -              wait_until_ready(kernel_notifications[notification_id]);
907 |    | -              delete kernel_notifications[notification_id];
908 |    | -            }
909 |    | -            kernel_notifications[notification_id] = new Notification();
910 |    | -          }
911 |    | -          const packLArg arg = {
912 |    | -            blockAs[blockAId], // blockA
913 |    | -            lhs,               // lhs
914 |    | -            m_start,           // m
915 |    | -            k_start,           // k
916 |    | -            actual_mc,         // mc
917 |    | -            actual_kc,         // kc
918 |    | -          };
919 |    | -
920 |    | -          // Delete any existing notification since we may be
921 |    | -          // replacing it. The algorithm should ensure that there are
922 |    | -          // no existing waiters on this notification.
923 |    | -          delete lhs_notifications[blockAId];
924 |    | -          lhs_notifications[blockAId] =
925 |    | -              this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
926 |    | -        }
927 |    | -
928 |    | -        // now start kernels.
929 |    | -        const Index m_base_start = m_block_idx * mc;
930 |    | -        const bool need_to_pack = m_block_idx == 0;
931 |    | -
932 |    | -        for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
933 |    | -          const Index n_start = n_block_idx * nc;
934 |    | -          const Index actual_nc = numext::mini(n_start + nc, n) - n_start;
935 |    | -
936 |    | -          // first make sure the previous kernels are all done before overwriting rhs. Also wait if
937 |    | -          // we're going to start new k. In both cases need_to_pack is true.
938 |    | -          if (need_to_pack) {
939 |    | -            for (Index i = num_blocks; i < num_threads; ++i) {
940 |    | -              Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
941 |    | -              Index future_id = (blockAId * n_blocks + n_block_idx);
942 |    | -              wait_until_ready(kernel_notifications[future_id]);
943 |    | -            }
944 |    | -          }
945 |    | -
946 |    | -          packRKArg arg = {
947 |    | -            &blockAs,              // blockA
948 |    | -            blockBs[n_block_idx],  // blockB
949 |    | -            rhs,                   // rhs
950 |    | -            output,                // output
951 |    | -            m_base_start,          // m
952 |    | -            k_start,               // k
953 |    | -            n_start,               // n
954 |    | -            mc,                    // mc
955 |    | -            actual_kc,             // kc
956 |    | -            actual_nc,             // nc
957 |    | -            num_threads,
958 |    | -            numBlockAs,
959 |    | -            m,
960 |    | -            k_block_idx,
961 |    | -            m_block_idx,
962 |    | -            n_block_idx,           // n_block_idx
963 |    | -            m_blocks,              // m_blocks
964 |    | -            n_blocks,              // n_blocks
965 |    | -            &kernel_notifications, // kernel notifications
966 |    | -            &lhs_notifications,    // lhs notifications
967 |    | -            need_to_pack,          // need_to_pack
968 |    | -          };
969 |    | -
970 |    | -          // We asynchronously kick off this function, which ends up
971 |    | -          // notifying the appropriate kernel_notifications objects,
972 |    | -          // which this thread waits on before exiting.
973 |    | -          this->m_device.enqueueNoNotification(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
974 |    | -        }
975 |    | -      }
976 |    | -    }
977 |    | -
978 |    | -    // Make sure all the kernels are done.
979 |    | -    for (size_t i = 0; i < kernel_notifications.size(); ++i) {
980 |    | -      wait_until_ready(kernel_notifications[i]);
981 |    | -      delete kernel_notifications[i];
982 |    | -    }
983 |    | -
984 |    | -    // No need to wait for lhs notifications since they should have
985 |    | -    // already been waited on. Just clean them up.
986 |    | -    for (size_t i = 0; i < lhs_notifications.size(); ++i) {
987 |    | -      delete lhs_notifications[i];
988 |    | -    }
989 |    | -
990 |    | -    // deallocate all of the memory for both A and B's
991 |    | -    for (size_t i = 0; i < blockAs.size(); i++) {
992 |    | -      this->m_device.deallocate(blockAs[i]);
993 |    | -    }
994 |    | -    for (size_t i = 0; i < blockBs.size(); i++) {
995 |    | -      this->m_device.deallocate(blockBs[i]);
996 |    | -    }
997 |    | -
998 |    | -#undef CEIL_DIV
999 |    | -  }
1000 |    | -
1001 |    | -  /*
1002 |    | -   * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing
1003 |    | -   * the LHS block, check that all of the kernels that worked on the same
1004 |    | -   * mt_block_idx in the previous m_block are done.
1005 |    | -   */
1006 |    | -  template <typename packLArg, typename LhsPacker>
1007 |    | -  static void packLhs(const packLArg arg) {
1008 |    | -    // perform actual packing
1009 |    | -    LhsPacker pack_lhs;
1010 |    | -    pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
1011 |    | -  }
1012 |    | -
1013 |    | -  /*
1014 |    | -   * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that
1015 |    | -   * all kernels in the previous block are done.
1016 |    | -   * Then for each LHS future, we wait on the future and then call GEBP
1017 |    | -   * on the area packed by the future (which starts at
1018 |    | -   * blockA + future_idx * mt * kc) on the LHS and with the full packed
1019 |    | -   * RHS block.
1020 |    | -   * The output of this GEBP is written to output(m + i * mt, n).
1021 |    | -   */
1022 |    | -  template <typename packRKArg, typename RhsPacker, typename GebpKernel>
1023 |    | -  static void packRhsAndKernel(packRKArg arg) {
1024 |    | -    if (arg.need_to_pack) {
1025 |    | -      RhsPacker pack_rhs;
1026 |    | -      pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
1027 |    | -    }
1028 |    | -
1029 |    | -    GebpKernel gebp;
1030 |    | -    for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
1031 |    | -      const Index m_base_start = arg.m + arg.mc*mt_block_idx;
1032 |    | -      if (m_base_start < arg.max_m) {
1033 |    | -        Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
1034 |    | -        wait_until_ready((*arg.lhs_notifications)[blockAId]);
1035 |    | -        const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
1036 |    | -        gebp(arg.output.getSubMapper(m_base_start, arg.n),
1037 |    | -             (*arg.blockAs)[blockAId], arg.blockB,
1038 |    | -             actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
1039 |    | -
1040 |    | -        // Notify that the kernel is done.
1041 |    | -        const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
1042 |    | -        (*arg.kernel_notifications)[set_idx]->Notify();
1043 |    | -      }
1044 |    | -    }
1045 |    | -  }
1046 |    | -#endif // EIGEN_USE_SIMPLE_THREAD_POOL
1047 |    | -
1048 | 724 |   TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
1049 | 725 |                                bool shard_by_col, bool prepacked) const {
1050 | 726 |     const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
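Two small pieces of integer arithmetic carry most of the bookkeeping in the removed evalGemm path: the CEIL_DIV block counts and the flat slot index blockAId * n_blocks + n_block_idx used to address the kernel notifications. The sketch below reproduces just that arithmetic with made-up sizes; it is not Eigen code, and every concrete number in it is an illustrative assumption.

    #include <cstdio>

    // Same computation as the removed CEIL_DIV(a, b) macro.
    static long ceil_div(long a, long b) { return (a + b - 1) / b; }

    int main() {
      // Illustrative problem and block sizes (not taken from Eigen's blocking).
      const long m = 1000, n = 900, k = 700;
      const long mc = 256, nc = 128, kc = 192;
      const long m_blocks = ceil_div(m, mc);  // 4 (last block has only 232 rows)
      const long n_blocks = ceil_div(n, nc);  // 8 (last block has only 4 columns)
      const long k_blocks = ceil_div(k, kc);  // 4 (last block has only 124 entries)
      std::printf("m_blocks=%ld n_blocks=%ld k_blocks=%ld\n",
                  m_blocks, n_blocks, k_blocks);

      // Kernel notifications live in a flat array of num_threads * n_blocks
      // slots; slot (blockAId, n_block_idx) is addressed the same way the
      // removed code does it.
      const long num_threads = 4;
      const long blockAId = 1, n_block_idx = 2;
      const long slot = blockAId * n_blocks + n_block_idx;  // 1 * 8 + 2 = 10
      std::printf("notification slots=%ld, slot for (1,2)=%ld\n",
                  num_threads * n_blocks, slot);
      return 0;
    }

Because blockAId is always reduced modulo num_threads and n_block_idx is below n_blocks, the flat index stays strictly below num_threads * n_blocks, the size the removed code allocates for kernel_notifications.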
|