 C10_DECLARE_int32(caffe2_dnnlowp_nbits_in_non_outlier);
 C10_DECLARE_int32(caffe2_dnnlowp_copy_to_32bit_frequency);
 C10_DECLARE_bool(caffe2_dnnlowp_shared_int32_buffer);
-
 // Thresholds to fallback to 32-bit accumulation when 16-bit accumulation
 // doesn't provide performance benefits.
 C10_DEFINE_double(
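For orientation, the M/N/K and outlier-density thresholds consulted further down are ordinary C10 flags, so they can be tuned at process startup without rebuilding. A minimal sketch of the pattern (the default value and help text here are illustrative, not the ones defined in this file):

C10_DEFINE_double(
    caffe2_dnnlowp_acc16_m_threshold,
    0.0, // illustrative default only
    "Fall back to 32-bit accumulation when M (batch size times output "
    "image size) is below this threshold");

// The operator then reads the generated FLAGS_ symbol:
// if (N * output_image_size < FLAGS_caffe2_dnnlowp_acc16_m_threshold) { ... }

In a binary that goes through caffe2::GlobalInit, such a flag can typically be overridden on the command line, e.g. --caffe2_dnnlowp_acc16_m_threshold=1024.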
@@ -62,43 +61,26 @@ ConvDNNLowPAcc16Op<ReluFused>::ConvDNNLowPAcc16Op(
 template <bool ReluFused>
 bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
   if (fallback_to_32_bit_accumulation_) {
-    return true;
-  }
-
-  if (!BaseType::GetQuantizationParameters_()) {
-    return false;
-  }
-
-  if (!Wq_acc16_packed_ &&
-      this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
-    CAFFE_ENFORCE_EQ(
-        this->order_,
-        StorageOrder::NHWC,
-        "Pre-packed weight only works with NHWC layout");
-    // If the input is already packed
-    const auto& packed_filter =
-        this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
-    Wq_outlier_ = packed_filter.W_outlier;
-    Wq_acc16_packed_ = packed_filter.W_acc16;
-
-    if (nbits_in_non_outlier_ != packed_filter.nbits_in_non_outlier) {
-      LOG(WARNING)
-          << "nbits_in_non_outlier in packed weight "
-          << packed_filter.nbits_in_non_outlier
-          << " doesn't match with nbits_in_non_outlier specified in operator "
-          << nbits_in_non_outlier_;
-    }
-
-    first_invocation_ = false;
-    return true;
+    // Short cut if we already know we are falling back to acc32
+    return BaseType::GetQuantizationParameters_();
   }
 
   int kernel_dim = this->KernelDim_();
   const auto& filter = InputTensorCPU_(FILTER);
   int num_out_channels = filter.dim32(0);
 
   // Check if we should fallback to 32-bit accumulation
-  if (this->order_ == StorageOrder::NHWC) {
+  // We should do this before GetQuantizationParameters_ to make sure
+  // GetQuantizationParameters_ initialize things like Wq_packed_ for acc32
+  // properly.
+
+  // We can't fallback if layout is not NHWC or
+  // if weight is prepacked and the prepacked weight doesn't have acc32.
+  bool can_fallback_to_32_bit_accumulation =
+      this->order_ == StorageOrder::NHWC &&
+      (!this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER) ||
+       this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER).W);
+  if (can_fallback_to_32_bit_accumulation) {
     const Tensor& X = InputTensorCPU_(INPUT);
     int N = X.dim32(0);
 
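Taken together with the next hunk, the change reorders the method so that the fallback decision is made first and the base class is invoked exactly once afterwards. A condensed sketch of the new flow, for orientation only (order_is_nhwc, prepacked(), and prepacked_blob() are stand-ins for the this->order_ check and the InputIsType<...>/Input<...> calls; most state and error handling is omitted):

bool GetQuantizationParameters_() {
  if (fallback_to_32_bit_accumulation_) {
    // Already decided on acc32: just let the base class set it up.
    return BaseType::GetQuantizationParameters_();
  }
  // Fallback is only possible for NHWC, and, for a prepacked weight,
  // only if the blob also carries an acc32 packing (packed_filter.W).
  const bool can_fallback =
      order_is_nhwc && (!prepacked() || prepacked_blob().W);
  if (can_fallback) {
    // The M/N/K and "no acc16 packing" heuristics below set
    // fallback_to_32_bit_accumulation_ but no longer return early ...
  }
  // ... so this single call can initialize Wq_packed_ for acc32 whenever
  // the fallback flag was just set.
  if (!BaseType::GetQuantizationParameters_()) {
    return false;
  }
  if (fallback_to_32_bit_accumulation_) {
    return true;
  }
  // Otherwise continue with acc16-specific setup (prepacked acc16 weights,
  // outlier separation, acc16 packing).
  return true;
}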
@@ -121,31 +103,71 @@ bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
     }
 
     if (N * output_image_size < FLAGS_caffe2_dnnlowp_acc16_m_threshold) {
-      LOG(INFO) << "M " << N * output_image_size
-                << " of Conv layer with weight blob "
-                << this->debug_def().input(1) << " is smaller than threshold "
-                << FLAGS_caffe2_dnnlowp_acc16_m_threshold
-                << " . Falling back to acc32";
+      LOG_FIRST_N(INFO, 10)
+          << "M " << N * output_image_size << " of Conv layer with weight blob "
+          << this->debug_def().input(FILTER) << " is smaller than threshold "
+          << FLAGS_caffe2_dnnlowp_acc16_m_threshold
+          << " . Falling back to acc32";
+      fallback_to_32_bit_accumulation_ = true;
+    }
+    if (!fallback_to_32_bit_accumulation_ &&
+        num_out_channels / group_ < acc16_n_threshold) {
+      LOG_FIRST_N(INFO, 10)
+          << "N " << num_out_channels / group_
+          << " of Conv layer with weight blob "
+          << this->debug_def().input(FILTER) << " is smaller than threshold "
+          << acc16_n_threshold << " . Falling back to acc32";
       fallback_to_32_bit_accumulation_ = true;
-      return true;
     }
-    if (num_out_channels / group_ < acc16_n_threshold) {
-      LOG(INFO) << "N " << num_out_channels / group_
-                << " of Conv layer with weight blob "
-                << this->debug_def().input(1) << " is smaller than threshold "
-                << acc16_n_threshold << " . Falling back to acc32";
+    if (!fallback_to_32_bit_accumulation_ && kernel_dim < acc16_k_threshold) {
+      LOG_FIRST_N(INFO, 10)
+          << "K " << kernel_dim << " of Conv layer with weight blob "
+          << this->debug_def().input(FILTER) << " is smaller than threshold "
+          << acc16_k_threshold << " . Falling back to acc32";
       fallback_to_32_bit_accumulation_ = true;
-      return true;
     }
-    if (kernel_dim < acc16_k_threshold) {
-      LOG(INFO) << "K " << kernel_dim << " of Conv layer with weight blob "
-                << this->debug_def().input(1) << " is smaller than threshold "
-                << acc16_k_threshold << " . Falling back to acc32";
+    if (!fallback_to_32_bit_accumulation_ &&
+        this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER) &&
+        !this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER)
+             .W_acc16) {
+      LOG_FIRST_N(INFO, 10)
+          << "Falling back to acc32 because packed weight for acc16 is not "
+             "available";
       fallback_to_32_bit_accumulation_ = true;
-      return true;
     }
   }
 
+  if (!BaseType::GetQuantizationParameters_()) {
+    return false;
+  }
+
+  if (fallback_to_32_bit_accumulation_) {
+    return true;
+  }
+
+  if (!Wq_acc16_packed_ &&
+      this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) {
+    CAFFE_ENFORCE_EQ(
+        this->order_,
+        StorageOrder::NHWC,
+        "Pre-packed weight only works with NHWC layout");
+    // If the input is already packed
+    const auto& packed_filter =
+        this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER);
+    Wq_outlier_ = packed_filter.W_outlier;
+    Wq_acc16_packed_ = packed_filter.W_acc16;
+
+    if (nbits_in_non_outlier_ != packed_filter.nbits_in_non_outlier) {
+      LOG_FIRST_N(WARNING, 10)
+          << "nbits_in_non_outlier in packed weight "
+          << packed_filter.nbits_in_non_outlier
+          << " doesn't match with nbits_in_non_outlier specified in operator "
+          << nbits_in_non_outlier_;
+    }
+    first_invocation_ = false;
+    return true;
+  }
+
   // Separate out outliers
   if (!Wq_outlier_ && this->order_ == StorageOrder::NHWC &&
       nbits_in_non_outlier_ < 8) {
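A note on the logging change that runs through the rest of the diff: LOG(INFO) and LOG(WARNING) fire every time the line executes, and GetQuantizationParameters_ can be reached on every operator invocation, so the old messages could flood the logs whenever a layer repeatedly hit a fallback condition. LOG_FIRST_N(severity, n) is the glog-style macro available through caffe2's logging headers; it keeps a per-call-site counter and only emits the first n occurrences. A minimal illustration (blob_name is a stand-in variable, not from this file):

// Logs at most 10 times from this call site over the life of the process,
// no matter how many times the surrounding code runs.
LOG_FIRST_N(INFO, 10) << "Falling back to acc32 for weight blob " << blob_name;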
@@ -159,20 +181,24 @@ bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
         W_quantized_));
     int outlier_cnt = Wq_outlier_->ColPtr()[num_out_channels];
 
-    LOG(INFO) << "Proportion of outlier for Conv layer with weight blob "
-              << this->debug_def().input(1) << " is "
-              << static_cast<float>(outlier_cnt) / W_quantized_.size();
-    LOG(INFO) << "nbits_in_non_outlier " << nbits_in_non_outlier_
-              << " copy_to_32bit_frequency " << copy_to_32bit_frequency_;
-
-    if (static_cast<float>(outlier_cnt) / W_quantized_.size() >
-        FLAGS_caffe2_dnnlowp_acc16_density_threshold) {
-      LOG(INFO) << "Density of outliers is higher than threshold "
-                << FLAGS_caffe2_dnnlowp_acc16_density_threshold
-                << " . Falling back to acc32";
+    LOG_FIRST_N(INFO, 10)
+        << "Proportion of outlier for Conv layer with weight blob "
+        << this->debug_def().input(FILTER) << " is "
+        << static_cast<float>(outlier_cnt) / W_quantized_.size();
+    LOG_FIRST_N(INFO, 10) << "nbits_in_non_outlier " << nbits_in_non_outlier_
+                          << " copy_to_32bit_frequency "
+                          << copy_to_32bit_frequency_;
+
+    if (can_fallback_to_32_bit_accumulation &&
+        static_cast<float>(outlier_cnt) / W_quantized_.size() >
+            FLAGS_caffe2_dnnlowp_acc16_density_threshold) {
+      LOG_FIRST_N(INFO, 10) << "Density of outliers is higher than threshold "
+                            << FLAGS_caffe2_dnnlowp_acc16_density_threshold
+                            << " . Falling back to acc32";
       fallback_to_32_bit_accumulation_ = true;
       Wq_outlier_.reset();
-      return true;
+      // We need to call GetQuantizationParameters_ again to pack for acc32
+      return BaseType::GetQuantizationParameters_();
     }
   }
 
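The density check above relies on the outlier matrix being stored in compressed sparse column form: ColPtr() has one entry per output channel plus one, and the last entry is the total number of stored outlier weights. A simplified sketch of the decision (names shortened; the real code is in the hunk above):

// col_ptr[num_out_channels] == total outliers stored across all columns.
int outlier_cnt = col_ptr[num_out_channels];
float density = static_cast<float>(outlier_cnt) / num_weight_elements;
if (density > FLAGS_caffe2_dnnlowp_acc16_density_threshold) {
  // Too many outliers for 16-bit accumulation to pay off: drop the outlier
  // matrix, flag the fallback, and rerun GetQuantizationParameters_ so the
  // acc32 weights get packed instead.
}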
@@ -193,17 +219,18 @@ bool ConvDNNLowPAcc16Op<ReluFused>::GetQuantizationParameters_() {
         static int log_occurences = 0;
         if (log_occurences < 32) {
           ++log_occurences;
-          LOG(WARNING) << "Conv with weight " << this->debug_def().input(FILTER)
-                       << " falls back to slow path because " << reason;
+          LOG_FIRST_N(WARNING, 10)
+              << "Conv with weight " << this->debug_def().input(FILTER)
+              << " falls back to slow path because " << reason;
         }
       }
     }
     if (nbits_in_non_outlier_ < 8 && this->order_ != StorageOrder::NHWC) {
       static int log_occurences = 0;
       if (log_occurences < 32) {
         ++log_occurences;
-        LOG(WARNING) << "Outlier-aware quantization only supports "
-                     "NHWC layout";
+        LOG_FIRST_N(WARNING, 10) << "Outlier-aware quantization only supports "
+                                    "NHWC layout";
       }
     }
     first_invocation_ = false;
@@ -359,7 +386,7 @@ bool ConvDNNLowPAcc16Op<ReluFused>::RunOnDeviceWithOrderNCHW() {
       static int log_occurences = 0;
       if (log_occurences < 32) {
         ++log_occurences;
-        LOG(WARNING)
+        LOG_FIRST_N(WARNING, 10)
             << "Consider using DNNLOWP instead of DNNLOWP_ACC16 engine since "
                "we're falling back to a slow path because of NCHW layout";
       }