Skip to content

Commit 421b18c

Browse files
viswanathgs authored and Rob Kunkle committed
Clip horizontal bounding boxes during rotated detection for backward compatibility (pytorch#9403)
Summary: Pull Request resolved: pytorch#9403

In the BBoxTransform and GenerateProposals ops, clip_boxes makes sure each bbox fits within the image. For rotated boxes this doesn't always make sense, as there can be multiple ways to clip a rotated box to the image boundary. Moreover, clipping to a horizontal box means we potentially leave out pixels of interest. Therefore, we clip only boxes whose angle is almost equal to 0 (within a specified `angle_thresh` tolerance).

Reviewed By: pjh5
Differential Revision: D8828588
fbshipit-source-id: 39c1eafdb5d39d383780faa0a47e76149145e50c
1 parent b920698 commit 421b18c

8 files changed

+272
-52
lines changed

caffe2/operators/bbox_transform_op.cc

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ Transform proposal bounding boxes to target bounding box using bounding box
5454
"angle_bound_hi",
5555
"int (default 90 degrees). If set, for rotated boxes, angle is "
5656
"normalized to be within [angle_bound_lo, angle_bound_hi].")
57+
.Arg(
58+
"clip_angle_thresh",
59+
"float (default 1.0 degrees). For RRPN, clip almost horizontal boxes "
60+
"within this threshold of tolerance for backward compatibility. "
61+
"Set to negative value for no clipping.")
5762
.Input(
5863
0,
5964
"rois",
@@ -168,7 +173,8 @@ bool BBoxTransformOp<float, CPUContext>::RunOnDevice() {
168173
angle_bound_on_,
169174
angle_bound_lo_,
170175
angle_bound_hi_);
171-
EArrXXf clip_boxes = utils::clip_boxes(trans_boxes, img_h, img_w);
176+
EArrXXf clip_boxes =
177+
utils::clip_boxes(trans_boxes, img_h, img_w, clip_angle_thresh_);
172178
// Do not apply scale for angle in rotated boxes
173179
clip_boxes.leftCols(4) *= scale_after;
174180
new_boxes.block(offset, k * box_dim, num_rois, box_dim) = clip_boxes;

caffe2/operators/bbox_transform_op.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,9 @@ class BBoxTransformOp final : public Operator<Context> {
2929
angle_bound_lo_(
3030
OperatorBase::GetSingleArgument<int>("angle_bound_lo", -90)),
3131
angle_bound_hi_(
32-
OperatorBase::GetSingleArgument<int>("angle_bound_hi", 90)) {
32+
OperatorBase::GetSingleArgument<int>("angle_bound_hi", 90)),
33+
clip_angle_thresh_(
34+
OperatorBase::GetSingleArgument<float>("clip_angle_thresh", 1.0)) {
3335
CAFFE_ENFORCE_EQ(
3436
weights_.size(),
3537
4,
@@ -59,6 +61,10 @@ class BBoxTransformOp final : public Operator<Context> {
5961
bool angle_bound_on_{true};
6062
int angle_bound_lo_{-90};
6163
int angle_bound_hi_{90};
64+
// For RRPN, clip almost horizontal boxes within this threshold of
65+
// tolerance for backward compatibility. Set to negative value for
66+
// no clipping.
67+
float clip_angle_thresh_{1.0};
6268
};
6369

6470
} // namespace caffe2

caffe2/operators/generate_proposals_op.cc

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -197,8 +197,8 @@ void GenerateProposalsOp<CPUContext>::ProposalsForOneImage(
197197

198198
// 2. clip proposals to image (may result in proposals with zero area
199199
// that will be removed in the next step)
200-
// TODO (viswanath): Should we clip rotated boxes as well?
201-
proposals = utils::clip_boxes(proposals, im_info[0], im_info[1]);
200+
proposals =
201+
utils::clip_boxes(proposals, im_info[0], im_info[1], clip_angle_thresh_);
202202

203203
// 3. remove predicted boxes with either height or width < min_size
204204
auto keep = utils::filter_boxes(proposals, min_size, im_info);
@@ -342,6 +342,29 @@ non-maximum suppression is applied to generate the final bounding boxes.
342342
.Arg("post_nms_topN", "(int) RPN_POST_NMS_TOP_N")
343343
.Arg("nms_thresh", "(float) RPN_NMS_THRESH")
344344
.Arg("min_size", "(float) RPN_MIN_SIZE")
345+
.Arg(
346+
"correct_transform_coords",
347+
"bool (default false), Correct bounding box transform coordates,"
348+
" see bbox_transform() in boxes.py "
349+
"Set to true to match the detectron code, set to false for backward"
350+
" compatibility")
351+
.Arg(
352+
"angle_bound_on",
353+
"bool (default true). If set, for rotated boxes, angle is "
354+
"normalized to be within [angle_bound_lo, angle_bound_hi].")
355+
.Arg(
356+
"angle_bound_lo",
357+
"int (default -90 degrees). If set, for rotated boxes, angle is "
358+
"normalized to be within [angle_bound_lo, angle_bound_hi].")
359+
.Arg(
360+
"angle_bound_hi",
361+
"int (default 90 degrees). If set, for rotated boxes, angle is "
362+
"normalized to be within [angle_bound_lo, angle_bound_hi].")
363+
.Arg(
364+
"clip_angle_thresh",
365+
"float (default 1.0 degrees). For RRPN, clip almost horizontal boxes "
366+
"within this threshold of tolerance for backward compatibility. "
367+
"Set to negative value for no clipping.")
345368
.Input(0, "scores", "Scores from conv layer, size (img_count, A, H, W)")
346369
.Input(
347370
1,

caffe2/operators/generate_proposals_op.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,9 @@ class GenerateProposalsOp final : public Operator<Context> {
8484
angle_bound_lo_(
8585
OperatorBase::GetSingleArgument<int>("angle_bound_lo", -90)),
8686
angle_bound_hi_(
87-
OperatorBase::GetSingleArgument<int>("angle_bound_hi", 90)) {}
87+
OperatorBase::GetSingleArgument<int>("angle_bound_hi", 90)),
88+
clip_angle_thresh_(
89+
OperatorBase::GetSingleArgument<float>("clip_angle_thresh", 1.0)) {}
8890

8991
~GenerateProposalsOp() {}
9092

@@ -127,6 +129,10 @@ class GenerateProposalsOp final : public Operator<Context> {
127129
bool angle_bound_on_{true};
128130
int angle_bound_lo_{-90};
129131
int angle_bound_hi_{90};
132+
// For RRPN, clip almost horizontal boxes within this threshold of
133+
// tolerance for backward compatibility. Set to negative value for
134+
// no clipping.
135+
float clip_angle_thresh_{1.0};
130136
};
131137

132138
} // namespace caffe2

caffe2/operators/generate_proposals_op_test.cc

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0) {
320320
// Similar to TestRealDownSampled but for rotated boxes with angle info.
321321
float angle = 0;
322322
float delta_angle = 0;
323+
float clip_angle_thresh = 1.0;
323324

324325
Workspace ws;
325326
OperatorDef def;
@@ -407,33 +408,37 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0) {
407408

408409
vector<float> im_info{60, 80, 0.166667f};
409410
// vector<float> anchors{-38, -16, 53, 31, -120, -120, 135, 135};
410-
vector<float> anchors{8, 8, 92, 48, angle, 8, 8, 256, 256, angle};
411-
412-
// Although angle == 0, the results aren't exactly the same as
413-
// TestRealDownSampled because because clip_boxes() is not performed
414-
// for RRPN style boxes.
415-
ERMatXf rois_gt(13, 6);
416-
rois_gt << 0, 6.55346, 25.3227, 253.447, 291.446, 0, 0, 55.3932, 33.3369,
417-
253.731, 289.158, 0, 0, 6.48163, 24.3478, 92.3015, 38.6944, 0, 0, 70.3089,
418-
26.7894, 92.3453, 38.5539, 0, 0, 22.3067, 26.7714, 92.3424, 38.5243, 0, 0,
419-
054.084, 26.8413, 92.3938, 38.798, 0, 0, 5.33962, 42.2077, 92.5497,
420-
38.2259, 0, 0, 6.36709, 58.24, 92.16, 37.4372, 0, 0, 69.65, 48.6713,
421-
92.1521, 37.3668, 0, 0, 20.4147, 44.4783, 91.7111, 34.0295, 0, 0, 033.079,
422-
41.5149, 92.3244, 36.4278, 0, 0, 41.8235, 037.291, 90.2815, 034.872, 0, 0,
423-
13.8486, 48.662, 88.7818, 28.875, 0;
424-
vector<float> rois_probs_gt{0.0266914,
425-
0.005621,
426-
0.00544219,
427-
0.00120544,
428-
0.00119208,
429-
0.00117182,
430-
0.000617993,
431-
0.000472735,
432-
6.09605e-05,
433-
1.05262e-05,
434-
8.91026e-06,
435-
9.29537e-09,
436-
1.13482e-10};
411+
// Anchors in [x_ctr, y_ctr, w, h, angle] format
412+
vector<float> anchors{7.5, 7.5, 92, 48, angle, 7.5, 7.5, 256, 256, angle};
413+
414+
// Results should exactly be the same as TestRealDownSampled since
415+
// angle = 0 for all boxes and clip_angle_thresh > 0 (which means
416+
// all horizontal boxes will be clipped to maintain backward compatibility).
417+
ERMatXf rois_gt_xyxy(9, 5);
418+
rois_gt_xyxy << 0, 0, 0, 79, 59, 0, 0, 5.0005703f, 51.6324f, 42.6950f, 0,
419+
24.13628387f, 7.51243401f, 79, 45.0663f, 0, 0, 7.50924301f, 67.4779f,
420+
45.0336, 0, 0, 23.09477997f, 50.61448669f, 59, 0, 0, 39.52141571f,
421+
51.44710541f, 59, 0, 23.57396317f, 29.98791885f, 79, 59, 0, 0,
422+
41.90219116f, 79, 59, 0, 0, 23.30098343f, 78.2413f, 58.7287f;
423+
ERMatXf rois_gt(9, 6);
424+
// Batch ID
425+
rois_gt.block(0, 0, rois_gt.rows(), 1) =
426+
ERMatXf::Constant(rois_gt.rows(), 1, 0.0);
427+
// rois_gt in [x_ctr, y_ctr, w, h] format
428+
rois_gt.block(0, 1, rois_gt.rows(), 4) =
429+
boxes_xyxy_to_xywh(rois_gt_xyxy.block(0, 1, rois_gt.rows(), 4));
430+
// Angle
431+
rois_gt.block(0, 5, rois_gt.rows(), 1) =
432+
ERMatXf::Constant(rois_gt.rows(), 1, angle);
433+
vector<float> rois_probs_gt{2.66913995e-02f,
434+
5.44218998e-03f,
435+
1.20544003e-03f,
436+
1.19207997e-03f,
437+
6.17993006e-04f,
438+
4.72735002e-04f,
439+
6.09605013e-05f,
440+
1.50015003e-05f,
441+
8.91025957e-06f};
437442

438443
AddInput(vector<TIndex>{img_count, A, H, W}, scores, "scores", &ws);
439444
AddInput(
@@ -450,6 +455,7 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotatedAngle0) {
450455
def.add_arg()->CopyFrom(MakeArgument("nms_thresh", 0.7f));
451456
def.add_arg()->CopyFrom(MakeArgument("min_size", 16.0f));
452457
def.add_arg()->CopyFrom(MakeArgument("correct_transform_coords", true));
458+
def.add_arg()->CopyFrom(MakeArgument("clip_angle_thresh", clip_angle_thresh));
453459

454460
unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
455461
EXPECT_NE(nullptr, op.get());
@@ -484,6 +490,7 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotated) {
484490
float angle = 45.0;
485491
float delta_angle = 0.174533; // 0.174533 radians -> 10 degrees
486492
float expected_angle = 55.0;
493+
float clip_angle_thresh = 1.0;
487494

488495
Workspace ws;
489496
OperatorDef def;
@@ -588,6 +595,7 @@ TEST(GenerateProposalsTest, TestRealDownSampledRotated) {
588595
def.add_arg()->CopyFrom(MakeArgument("nms_thresh", 0.7f));
589596
def.add_arg()->CopyFrom(MakeArgument("min_size", 16.0f));
590597
def.add_arg()->CopyFrom(MakeArgument("correct_transform_coords", true));
598+
def.add_arg()->CopyFrom(MakeArgument("clip_angle_thresh", clip_angle_thresh));
591599

592600
unique_ptr<OperatorBase> op(CreateOperator(def, &ws));
593601
EXPECT_NE(nullptr, op.get());

caffe2/operators/generate_proposals_op_util_boxes.h

Lines changed: 104 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -192,23 +192,50 @@ EArrXXt<typename Derived1::Scalar> bbox_transform(
192192
}
193193
}
194194

195+
template <class Derived>
196+
EArrXXt<typename Derived::Scalar> bbox_xyxy_to_ctrwh(
197+
const Eigen::ArrayBase<Derived>& boxes) {
198+
CAFFE_ENFORCE_EQ(boxes.cols(), 4);
199+
200+
const auto& x1 = boxes.col(0);
201+
const auto& y1 = boxes.col(1);
202+
const auto& x2 = boxes.col(2);
203+
const auto& y2 = boxes.col(3);
204+
205+
EArrXXt<typename Derived::Scalar> ret(boxes.rows(), 4);
206+
ret.col(0) = (x1 + x2) / 2.0; // x_ctr
207+
ret.col(1) = (y1 + y2) / 2.0; // y_ctr
208+
ret.col(2) = x2 - x1 + 1.0; // w
209+
ret.col(3) = y2 - y1 + 1.0; // h
210+
return ret;
211+
}
212+
213+
template <class Derived>
214+
EArrXXt<typename Derived::Scalar> bbox_ctrwh_to_xyxy(
215+
const Eigen::ArrayBase<Derived>& boxes) {
216+
CAFFE_ENFORCE_EQ(boxes.cols(), 4);
217+
218+
const auto& x_ctr = boxes.col(0);
219+
const auto& y_ctr = boxes.col(1);
220+
const auto& w = boxes.col(2);
221+
const auto& h = boxes.col(3);
222+
223+
EArrXXt<typename Derived::Scalar> ret(boxes.rows(), 4);
224+
ret.col(0) = x_ctr - (w - 1) / 2.0; // x1
225+
ret.col(1) = y_ctr - (h - 1) / 2.0; // y1
226+
ret.col(2) = x_ctr + (w - 1) / 2.0; // x2
227+
ret.col(3) = y_ctr + (h - 1) / 2.0; // y2
228+
return ret;
229+
}
230+
195231
// Clip boxes to image boundaries
196232
// boxes: pixel coordinates of bounding box, size (M * 4)
197-
//
198-
// For rotated boxes with angle support (M * 5), we don't clip and just
199-
// return early. It's tricky to make the entire rectangular box fit within the
200-
// image and still be able to not leave out pixels of interest.
201-
// We rely on upstream ops like RoIAlignRotated safely handling such cases.
202233
template <class Derived>
203-
EArrXXt<typename Derived::Scalar>
204-
clip_boxes(const Eigen::ArrayBase<Derived>& boxes, int height, int width) {
205-
CAFFE_ENFORCE(boxes.cols() == 4 || boxes.cols() == 5);
206-
if (boxes.cols() == 5) {
207-
// No clipping for rotated boxes.
208-
// TODO (viswanath): Should this be implemented for backward compatibility
209-
// with angle=0 case?
210-
return boxes;
211-
}
234+
EArrXXt<typename Derived::Scalar> clip_boxes_upright(
235+
const Eigen::ArrayBase<Derived>& boxes,
236+
int height,
237+
int width) {
238+
CAFFE_ENFORCE(boxes.cols() == 4);
212239

213240
EArrXXt<typename Derived::Scalar> ret(boxes.rows(), boxes.cols());
214241

@@ -224,6 +251,69 @@ clip_boxes(const Eigen::ArrayBase<Derived>& boxes, int height, int width) {
224251
return ret;
225252
}
226253

254+
// Similar to clip_boxes_upright but handles rotated boxes with angle info.
255+
// boxes: size (M, 5), format [ctr_x; ctr_y; width; height; angle (in degrees)]
256+
//
257+
// Clipping is only performed for boxes that are almost upright
258+
// (within a given `angle_thresh` tolerance) to maintain backward compatibility
259+
// for non-rotated boxes.
260+
//
261+
// We don't clip rotated boxes due to a couple of reasons:
262+
// (1) There are potentially multiple ways to clip a rotated box to make it
263+
// fit within the image.
264+
// (2) It's tricky to make the entire rectangular box fit within the image and
265+
// still be able to not leave out pixels of interest.
266+
// Therefore, we rely on upstream ops like RoIAlignRotated safely handling this.
267+
template <class Derived>
268+
EArrXXt<typename Derived::Scalar> clip_boxes_rotated(
269+
const Eigen::ArrayBase<Derived>& boxes,
270+
int height,
271+
int width,
272+
float angle_thresh = 1.0) {
273+
CAFFE_ENFORCE(boxes.cols() == 5);
274+
275+
const auto& angles = boxes.col(4);
276+
277+
// Filter boxes that are upright (with a tolerance of angle_thresh)
278+
EArrXXt<typename Derived::Scalar> upright_boxes;
279+
const auto& indices = GetArrayIndices(angles.abs() <= angle_thresh);
280+
GetSubArrayRows(boxes, AsEArrXt(indices), &upright_boxes);
281+
282+
// Convert to [x1, y1, x2, y2] format and clip them
283+
const auto& upright_boxes_xyxy =
284+
bbox_ctrwh_to_xyxy(upright_boxes.leftCols(4));
285+
const auto& clipped_upright_boxes_xyxy =
286+
clip_boxes_upright(upright_boxes_xyxy, height, width);
287+
288+
// Convert back to [x_ctr, y_ctr, w, h, angle] and update upright boxes
289+
upright_boxes.block(0, 0, upright_boxes.rows(), 4) =
290+
bbox_xyxy_to_ctrwh(clipped_upright_boxes_xyxy);
291+
292+
EArrXXt<typename Derived::Scalar> ret(boxes.rows(), boxes.cols());
293+
ret = boxes;
294+
for (int i = 0; i < upright_boxes.rows(); ++i) {
295+
ret.row(indices[i]) = upright_boxes.row(i);
296+
}
297+
return ret;
298+
}
299+
300+
// Clip boxes to image boundaries.
301+
template <class Derived>
302+
EArrXXt<typename Derived::Scalar> clip_boxes(
303+
const Eigen::ArrayBase<Derived>& boxes,
304+
int height,
305+
int width,
306+
float angle_thresh = 1.0) {
307+
CAFFE_ENFORCE(boxes.cols() == 4 || boxes.cols() == 5);
308+
if (boxes.cols() == 4) {
309+
// Upright boxes
310+
return clip_boxes_upright(boxes, height, width);
311+
} else {
312+
// Rotated boxes with angle info
313+
return clip_boxes_rotated(boxes, height, width, angle_thresh);
314+
}
315+
}
316+
227317
// Only keep boxes with both sides >= min_size and center within the image.
228318
// boxes: pixel coordinates of bounding box, size (M * 4)
229319
// im_info: [height, width, img_scale]

caffe2/operators/generate_proposals_op_util_boxes_test.cc

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,4 +105,33 @@ TEST(UtilsBoxesTest, TestBboxTransformRotatedNormalized) {
105105
EXPECT_NEAR((result.matrix() - result_gt).norm(), 0.0, 1e-2);
106106
}
107107

108+
TEST(UtilsBoxesTest, ClipRotatedBoxes) {
  // Exercises utils::clip_boxes() on rotated (M, 5) boxes, which dispatches
  // to utils::clip_boxes_rotated().
  using EMatXf = Eigen::MatrixXf;

  const int height = 800;
  const int width = 600;

  // Input boxes in [x_ctr, y_ctr, w, h, angle] format.
  EMatXf bbox(5, 5);
  bbox << 20, 20, 200, 150, 0, // Horizontal
      20, 20, 200, 150, 0.5, // Almost horizontal
      20, 20, 200, 150, 30, // Rotated
      300, 300, 200, 150, 30, // Rotated
      579, 779, 200, 150, -0.5; // Almost horizontal

  // A negative threshold disables clipping: output must equal the input.
  const float no_clip_thresh = -1.0;
  const auto unclipped =
      utils::clip_boxes(bbox.array(), height, width, no_clip_thresh);
  EXPECT_NEAR((unclipped.matrix() - bbox).norm(), 0.0, 1e-4);

  // With a 1 degree tolerance, only the (almost) horizontal boxes are
  // clipped; the two boxes rotated by 30 degrees pass through unchanged.
  EMatXf result_gt(5, 5);
  result_gt << 59.75, 47.25, 120.5, 95.5, 0, // Clipped
      59.75, 47.25, 120.5, 95.5, 0.5, // Clipped
      20, 20, 200, 150, 30, // Unchanged
      300, 300, 200, 150, 30, // Unchanged
      539.25, 751.75, 120.5, 95.5, -0.5; // Clipped

  const float clip_thresh = 1.0;
  const auto clipped =
      utils::clip_boxes(bbox.array(), height, width, clip_thresh);
  EXPECT_NEAR((clipped.matrix() - result_gt).norm(), 0.0, 1e-4);
}
136+
108137
} // namespace caffe2

0 commit comments

Comments
 (0)