mindee · paulo9631 · Jun 30, 2026
diff --git a/doctr/datasets/coco_text.py b/doctr/datasets/coco_text.py
@@ -102,40 +102,73 @@ def __init__(
 
             for annotation in annotations:
                 x, y, w, h = annotation["bbox"]
-                if use_polygons:
-                    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                    box = np.array(
-                        [
-                            [x, y],
-                            [x + w, y],
-                            [x + w, y + h],
-                            [x, y + h],
-                        ],
-                        dtype=np_dtype,
-                    )
-                else:
-                    # (xmin, ymin, xmax, ymax) coordinates
-                    box = [x, y, x + w, y + h]
+                box = self._build_box(x, y, w, h, use_polygons, np_dtype)
                 _targets.append((annotation["utf8_string"], box))
             text_targets, box_targets = zip(*_targets)
-
-            if recognition_task:
-                crops = crop_bboxes_from_image(
-                    img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0)
-                )
-                for crop, label in zip(crops, list(text_targets)):
-                    if label and " " not in label:
-                        self.data.append((crop, label))
-
-            elif detection_task:
-                self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0)))
-            else:
-                self.data.append((
-                    img_path,
-                    dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)),
-                ))
+            self._process_task_sample(img_path, text_targets, box_targets, recognition_task, detection_task, tmp_root)
 
         self.root = tmp_root
 
+    @staticmethod
+    def _build_box(x: float, y: float, w: float, h: float, use_polygons: bool, np_dtype: type) -> list[float] | np.ndarray:
+        """Convert an ``(x, y, w, h)`` bounding box into the target geometry.
+
+        Args:
+            x: x coordinate of the top left corner
+            y: y coordinate of the top left corner
+            w: width of the box
+            h: height of the box
+            use_polygons: whether to return the 4 corner points instead of a straight box
+            np_dtype: dtype of the polygon array (only used when `use_polygons` is True)
+
+        Returns:
+            a (4, 2) array of corner points if `use_polygons`, else the list [xmin, ymin, xmax, ymax]
+        """
+        if use_polygons:
+            # (x, y) coordinates of top left, top right, bottom right, bottom left corners
+            return np.array(
+                [
+                    [x, y],
+                    [x + w, y],
+                    [x + w, y + h],
+                    [x, y + h],
+                ],
+                dtype=np_dtype,
+            )
+        # (xmin, ymin, xmax, ymax) coordinates
+        return [x, y, x + w, y + h]
+
+    def _process_task_sample(
+        self,
+        img_path: str,
+        text_targets: tuple[str, ...],
+        box_targets: tuple[list[float] | np.ndarray, ...],
+        recognition_task: bool,
+        detection_task: bool,
+        tmp_root: str,
+    ) -> None:
+        """Append the sample(s) of one image to `self.data` in the format of the selected task.
+
+        Args:
+            img_path: path of the image, joined to `tmp_root` when crops are extracted
+            text_targets: text label of each annotation of the image
+            box_targets: box of each annotation, in the same order as `text_targets`
+            recognition_task: whether to store one (crop, label) pair per word
+            detection_task: whether to store (img_path, boxes) only; ignored if `recognition_task` is True
+            tmp_root: directory that `img_path` is relative to
+        """
+        # boxes are cast to int and negative coordinates are clipped to 0 for every task
+        boxes = np.asarray(box_targets, dtype=int).clip(min=0)
+        if recognition_task:
+            crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=boxes)
+            for crop, label in zip(crops, text_targets):
+                # keep only non-empty labels that contain no space
+                if label and " " not in label:
+                    self.data.append((crop, label))
+        elif detection_task:
+            self.data.append((img_path, boxes))
+        else:
+            self.data.append((img_path, dict(boxes=boxes, labels=list(text_targets))))
+
     def extra_repr(self) -> str:
         return f"train={self.train}"
diff --git a/doctr/datasets/cord.py b/doctr/datasets/cord.py
@@ -84,31 +84,7 @@ def __init__(
             if not os.path.exists(os.path.join(tmp_root, img_path)):
                 raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")
 
-            stem = Path(img_path).stem
-            _targets = []
-            with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f:
-                label = json.load(f)
-                for line in label["valid_line"]:
-                    for word in line["words"]:
-                        if len(word["text"]) > 0:
-                            x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"]
-                            y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"]
-                            box: list[float] | np.ndarray
-                            if use_polygons:
-                                # (x, y) coordinates of top left, top right, bottom right, bottom left corners
-                                box = np.array(
-                                    [
-                                        [x[0], y[0]],
-                                        [x[1], y[1]],
-                                        [x[2], y[2]],
-                                        [x[3], y[3]],
-                                    ],
-                                    dtype=np_dtype,
-                                )
-                            else:
-                                # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax
-                                box = [min(x), min(y), max(x), max(y)]
-                            _targets.append((word["text"], box))
+            _targets = self._process_image(img_path, use_polygons, np_dtype)
 
             text_targets, box_targets = zip(*_targets)
 
@@ -129,5 +105,48 @@ def __init__(
 
         self.root = tmp_root
 
+    def _process_image(
+        self,
+        img_path: str,
+        use_polygons: bool,
+        np_dtype: np.dtype,
+    ) -> list[tuple[str, list[float] | np.ndarray]]:
+        """Load the word-level targets of one image from its JSON annotation file.
+
+        Args:
+            img_path: path of the image; its stem is used to locate `json/<stem>.json` under `self.root`
+            use_polygons: whether to keep the 4 corner points instead of reducing them to a straight box
+            np_dtype: dtype of the polygon arrays (only used when `use_polygons` is True)
+
+        Returns:
+            one (text, box) tuple per word with a non-empty text
+        """
+        stem = Path(img_path).stem
+        _targets: list[tuple[str, list[float] | np.ndarray]] = []
+        with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f:
+            label = json.load(f)
+            for line in label["valid_line"]:
+                for word in line["words"]:
+                    if len(word["text"]) > 0:
+                        x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"]
+                        y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"]
+                        box: list[float] | np.ndarray
+                        if use_polygons:
+                            # (x, y) coordinates of top left, top right, bottom right, bottom left corners
+                            box = np.array(
+                                [
+                                    [x[0], y[0]],
+                                    [x[1], y[1]],
+                                    [x[2], y[2]],
+                                    [x[3], y[3]],
+                                ],
+                                dtype=np_dtype,
+                            )
+                        else:
+                            # Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax
+                            box = [min(x), min(y), max(x), max(y)]
+                        _targets.append((word["text"], box))
+        return _targets
+
     def extra_repr(self) -> str:
         return f"train={self.train}"
diff --git a/doctr/models/_utils.py b/doctr/models/_utils.py
@@ -31,6 +31,110 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float:
     return max(w / h, h / w)
 
 
+def _compute_contour_angles(
+    contours: list[np.ndarray],
+    n_ct: int,
+    ratio_threshold_for_lines: float,
+) -> list[float]:
+    """Collect the angles of the line-like contours among the first `n_ct` contours.
+
+    Args:
+        contours: contours to inspect, only the first `n_ct` are used
+        n_ct: maximum number of contours to consider
+        ratio_threshold_for_lines: minimum long side / short side ratio for a contour to be kept as a line
+
+    Returns:
+        the angle of each retained contour
+    """
+    angles = []
+    for contour in contours[:n_ct]:
+        _, (w, h), angle = cv2.minAreaRect(contour)
+
+        # OpenCV version-proof normalization: force 'w' to be the long side
+        # so the angle is consistently relative to the major axis.
+        # https://github.com/opencv/opencv/pull/28051/changes
+        if w < h:
+            w, h = h, w
+            angle -= 90
+
+        # Normalize angle to be within (-90, 90]
+        while angle <= -90:
+            angle += 180
+        while angle > 90:
+            angle -= 180
+
+        if h > 0:
+            # NOTE(review): w >= h after the swap above, so the elif branch looks unreachable - confirm
+            if w / h > ratio_threshold_for_lines:  # select only contours with ratio like lines
+                angles.append(angle)
+            elif w / h < 1 / ratio_threshold_for_lines:  # if lines are vertical, subtract 90 degree
+                angles.append(angle - 90)
+    return angles
+
+
+def _compute_median_skew_angle(angles: list[float]) -> int:
+    """Reduce the contour angles to a single rounded skew angle.
+
+    Args:
+        angles: angles of the line-like contours
+
+    Returns:
+        the rounded low median of `angles`, or 0 if `angles` is empty or the result is +/-90
+    """
+    if len(angles) == 0:
+        return 0  # in case no angles is found
+    # median_low picks a value from the data to avoid outliers
+    median = -median_low(angles)
+    skew_angle = -round(median) if abs(median) != 0 else 0
+
+    # Resolve the 90-degree flip ambiguity.
+    # If the estimation is exactly 90/-90, it's usually a vertical detection of horizontal lines.
+    if abs(skew_angle) == 90:
+        skew_angle = 0
+    return skew_angle
+
+
+def _resolve_final_angle(
+    base_angle: int,
+    skew_angle: int,
+    is_confident: bool,
+    page_orientation: int,
+) -> int:
+    """Combine the base orientation and the estimated skew into the returned angle.
+
+    Args:
+        base_angle: base orientation (`page_orientation` when confident, 0 otherwise)
+        skew_angle: skew angle estimated from the contours
+        is_confident: whether the general page orientation is trusted
+        page_orientation: general page orientation, returned as is when the skew estimate is considered unreliable
+
+    Returns:
+        the clockwise angle (negative - left side rotation, positive - right side rotation)
+    """
+    # Apply the detected skew to our base orientation
+    final_angle = base_angle + skew_angle
+
+    # Standardize result to the (-180, 180] range to handle wrap-around cases (e.g., 180 + -31)
+    while final_angle > 180:
+        final_angle -= 360
+    while final_angle <= -180:
+        final_angle += 360
+
+    if is_confident:
+        # If the estimated angle is perpendicular, treat it as 0 to avoid wrong flips
+        if abs(skew_angle) % 90 == 0:
+            return page_orientation
+
+        # special case where the estimated angle is mostly wrong:
+        # case 1: - and + swapped
+        # case 2: estimated angle is completely wrong
+        # so in this case we prefer the general page orientation
+        if abs(skew_angle) == abs(page_orientation) and page_orientation != 0:
+            return page_orientation
+
+    return int(final_angle)
+
+
 def estimate_orientation(
     img: np.ndarray,
     general_page_orientation: tuple[int, float] | None = None,
@@ -69,87 +173,31 @@ def estimate_orientation(
     base_angle = page_orientation if is_confident else 0
 
     if is_confident:
         # We rotate the image to the general orientation which improves the detection
         # No expand needed bitmap is already padded
         thresh = rotate_image(thresh, -base_angle)
     else:  # That's only required if we do not work on the detection models bin map
         # try to merge words in lines
         (h, w) = img.shape[:2]
         k_x = max(1, (floor(w / 100)))
         k_y = max(1, (floor(h / 100)))
         kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
         thresh = cv2.dilate(thresh, kernel, iterations=1)
 
     # extract contours
     contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
 
     # Filter & Sort contours
     contours = sorted(
         [contour for contour in contours if cv2.contourArea(contour) > lower_area],
         key=get_max_width_length_ratio,
         reverse=True,
     )
 
-    angles = []
-    for contour in contours[:n_ct]:
-        _, (w, h), angle = cv2.minAreaRect(contour)
-
-        # OpenCV version-proof normalization: force 'w' to be the long side
-        # so the angle is consistently relative to the major axis.
-        # https://github.com/opencv/opencv/pull/28051/changes
-        if w < h:
-            w, h = h, w
-            angle -= 90
-
-        # Normalize angle to be within [-90, 90]
-        while angle <= -90:
-            angle += 180
-        while angle > 90:
-            angle -= 180
-
-        if h > 0:
-            if w / h > ratio_threshold_for_lines:  # select only contours with ratio like lines
-                angles.append(angle)
-            elif w / h < 1 / ratio_threshold_for_lines:  # if lines are vertical, substract 90 degree
-                angles.append(angle - 90)
-
-    if len(angles) == 0:
-        skew_angle = 0  # in case no angles is found
-    else:
-        # median_low picks a value from the data to avoid outliers
-        median = -median_low(angles)
-        skew_angle = -round(median) if abs(median) != 0 else 0
-
-        # Resolve the 90-degree flip ambiguity.
-        # If the estimation is exactly 90/-90, it's usually a vertical detection of horizontal lines.
-        if abs(skew_angle) == 90:
-            skew_angle = 0
-
-    # combine with the general orientation and the estimated angle
-    # Apply the detected skew to our base orientation
-    final_angle = base_angle + skew_angle
-
-    # Standardize result to [-179, 180] range to handle wrap-around cases (e.g., 180 + -31)
-    while final_angle > 180:
-        final_angle -= 360
-    while final_angle <= -180:
-        final_angle += 360
-
-    if is_confident:
-        # If the estimated angle is perpendicular, treat it as 0 to avoid wrong flips
-        if abs(skew_angle) % 90 == 0:
-            return page_orientation
-
-        # special case where the estimated angle is mostly wrong:
-        # case 1: - and + swapped
-        # case 2: estimated angle is completely wrong
-        # so in this case we prefer the general page orientation
-        if abs(skew_angle) == abs(page_orientation) and page_orientation != 0:
-            return page_orientation
-
-    return int(
-        final_angle
-    )  # return the clockwise angle (negative - left side rotation, positive - right side rotation)
+    angles = _compute_contour_angles(contours, n_ct, ratio_threshold_for_lines)
+    skew_angle = _compute_median_skew_angle(angles)
+    # combine with the general orientation and the estimated angle
+    return _resolve_final_angle(base_angle, skew_angle, is_confident, page_orientation)
 
 
 def rectify_crops(

diff --git a/doctr/models/detection/_utils/base.py b/doctr/models/detection/_utils/base.py
@@ -9,6 +9,39 @@
 __all__ = ["_remove_padding"]
 
 
+def _adjust_coords(
+    loc_pred: np.ndarray,
+    ratio: float,
+    symmetric_pad: bool,
+    assume_straight_pages: bool,
+    axis: int,
+) -> None:
+    """Adjust coordinates along a given axis to remove padding
+
+    The predictions are modified in place, nothing is returned.
+
+    Args:
+        loc_pred: localization predictions, indexed as [:, [axis, axis + 2]] for straight pages
+            and [:, :, axis] otherwise
+        ratio: aspect ratio multiplier
+        symmetric_pad: whether the padding was symmetric, in which case coordinates are rescaled around 0.5
+        assume_straight_pages: whether the pages are assumed to be straight
+        axis: 0 for x coordinates, 1 for y coordinates
+    """
+    if assume_straight_pages:
+        # straight boxes: the coordinates of this axis sit in columns `axis` and `axis + 2`
+        cols = [axis, axis + 2]
+        if symmetric_pad:
+            loc_pred[:, cols] = (loc_pred[:, cols] - 0.5) * ratio + 0.5
+        else:
+            loc_pred[:, cols] *= ratio
+    else:
+        if symmetric_pad:
+            loc_pred[:, :, axis] = (loc_pred[:, :, axis] - 0.5) * ratio + 0.5
+        else:
+            loc_pred[:, :, axis] *= ratio
+
+
 def _remove_padding(
     pages: list[np.ndarray],
     loc_preds: list[dict[str, np.ndarray]],
@@ -29,35 +62,17 @@
         list of unpaded localization predictions
     """
     if preserve_aspect_ratio:
         # Rectify loc_preds to remove padding
         rectified_preds = []
         for page, dict_loc_preds in zip(pages, loc_preds):
             for k, loc_pred in dict_loc_preds.items():
                 h, w = page.shape[0], page.shape[1]
                 if h > w:
                     # y unchanged, dilate x coord
-                    if symmetric_pad:
-                        if assume_straight_pages:
-                            loc_pred[:, [0, 2]] = (loc_pred[:, [0, 2]] - 0.5) * h / w + 0.5
-                        else:
-                            loc_pred[:, :, 0] = (loc_pred[:, :, 0] - 0.5) * h / w + 0.5
-                    else:
-                        if assume_straight_pages:
-                            loc_pred[:, [0, 2]] *= h / w
-                        else:
-                            loc_pred[:, :, 0] *= h / w
+                    _adjust_coords(loc_pred, h / w, symmetric_pad, assume_straight_pages, axis=0)
                 elif w > h:
                     # x unchanged, dilate y coord
-                    if symmetric_pad:
-                        if assume_straight_pages:
-                            loc_pred[:, [1, 3]] = (loc_pred[:, [1, 3]] - 0.5) * w / h + 0.5
-                        else:
-                            loc_pred[:, :, 1] = (loc_pred[:, :, 1] - 0.5) * w / h + 0.5
-                    else:
-                        if assume_straight_pages:
-                            loc_pred[:, [1, 3]] *= w / h
-                        else:
-                            loc_pred[:, :, 1] *= w / h
+                    _adjust_coords(loc_pred, w / h, symmetric_pad, assume_straight_pages, axis=1)
                 rectified_preds.append({k: np.clip(loc_pred, 0, 1)})
         return rectified_preds
     return loc_preds