Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 40 additions & 30 deletions doctr/datasets/coco_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,40 +102,50 @@ def __init__(

for annotation in annotations:
x, y, w, h = annotation["bbox"]
if use_polygons:
# (x, y) coordinates of top left, top right, bottom right, bottom left corners
box = np.array(
[
[x, y],
[x + w, y],
[x + w, y + h],
[x, y + h],
],
dtype=np_dtype,
)
else:
# (xmin, ymin, xmax, ymax) coordinates
box = [x, y, x + w, y + h]
box = self._build_box(x, y, w, h, use_polygons, np_dtype)
_targets.append((annotation["utf8_string"], box))
text_targets, box_targets = zip(*_targets)

if recognition_task:
crops = crop_bboxes_from_image(
img_path=os.path.join(tmp_root, img_path), geoms=np.asarray(box_targets, dtype=int).clip(min=0)
)
for crop, label in zip(crops, list(text_targets)):
if label and " " not in label:
self.data.append((crop, label))

elif detection_task:
self.data.append((img_path, np.asarray(box_targets, dtype=int).clip(min=0)))
else:
self.data.append((
img_path,
dict(boxes=np.asarray(box_targets, dtype=int).clip(min=0), labels=list(text_targets)),
))
self._process_task_sample(img_path, text_targets, box_targets, recognition_task, detection_task, tmp_root)

self.root = tmp_root

@staticmethod
def _build_box(
    x: float,
    y: float,
    w: float,
    h: float,
    use_polygons: bool,
    np_dtype: type,
) -> list[float] | np.ndarray:
    """Turn an ``(x, y, w, h)`` box into the geometry format requested by the dataset.

    Args:
        x: horizontal coordinate of the box origin
        y: vertical coordinate of the box origin
        w: box width
        h: box height
        use_polygons: whether to return the four corners instead of a straight box
        np_dtype: dtype of the returned array (only used when `use_polygons` is True)

    Returns:
        a (4, 2) array of corners when `use_polygons` is True, otherwise the list
        `[x, y, x + w, y + h]`
    """
    x_max, y_max = x + w, y + h
    if not use_polygons:
        # (xmin, ymin, xmax, ymax) coordinates
        return [x, y, x_max, y_max]
    # (x, y) coordinates of top left, top right, bottom right, bottom left corners
    corners = [[x, y], [x_max, y], [x_max, y_max], [x, y_max]]
    return np.array(corners, dtype=np_dtype)

def _process_task_sample(
    self,
    img_path: str,
    text_targets: tuple[str, ...],
    box_targets: tuple[list[float] | np.ndarray, ...],
    recognition_task: bool,
    detection_task: bool,
    tmp_root: str,
) -> None:
    """Append the sample(s) built from one image to `self.data`, in the layout of the selected task.

    Args:
        img_path: image path, stored as-is in the sample and joined to `tmp_root` for cropping
        text_targets: text label of each box
        box_targets: geometry of each box
        recognition_task: if True, store one `(crop, label)` pair per kept word
        detection_task: if True (and `recognition_task` is False), store `(img_path, boxes)`
        tmp_root: directory joined with `img_path` to locate the image to crop
    """
    # Every task stores the boxes as integers with negative coordinates clamped to 0
    geoms = np.asarray(box_targets, dtype=int).clip(min=0)

    if recognition_task:
        crops = crop_bboxes_from_image(img_path=os.path.join(tmp_root, img_path), geoms=geoms)
        # Keep only the non-empty labels that do not contain a space
        self.data.extend((crop, label) for crop, label in zip(crops, text_targets) if label and " " not in label)
        return

    if detection_task:
        self.data.append((img_path, geoms))
        return

    # Default layout: boxes and labels bundled in a single target dict
    self.data.append((img_path, {"boxes": geoms, "labels": list(text_targets)}))

def extra_repr(self) -> str:
    """Return the `train=<value>` fragment describing this dataset instance."""
    return f"train={self.train}"
57 changes: 32 additions & 25 deletions doctr/datasets/cord.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,31 +84,7 @@ def __init__(
if not os.path.exists(os.path.join(tmp_root, img_path)):
raise FileNotFoundError(f"unable to locate {os.path.join(tmp_root, img_path)}")

stem = Path(img_path).stem
_targets = []
with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f:
label = json.load(f)
for line in label["valid_line"]:
for word in line["words"]:
if len(word["text"]) > 0:
x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"]
y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"]
box: list[float] | np.ndarray
if use_polygons:
# (x, y) coordinates of top left, top right, bottom right, bottom left corners
box = np.array(
[
[x[0], y[0]],
[x[1], y[1]],
[x[2], y[2]],
[x[3], y[3]],
],
dtype=np_dtype,
)
else:
# Reduce 8 coords to 4 -> xmin, ymin, xmax, ymax
box = [min(x), min(y), max(x), max(y)]
_targets.append((word["text"], box))
_targets = self._process_image(img_path, tmp_root, use_polygons, np_dtype)

text_targets, box_targets = zip(*_targets)

Expand All @@ -129,5 +105,36 @@ def __init__(

self.root = tmp_root

def _process_image(
self,
img_path: str,
tmp_root: str,
use_polygons: bool,
np_dtype: np.dtype,
) -> list[tuple[str, list[float] | np.ndarray]]:
stem = Path(img_path).stem
_targets: list[tuple[str, list[float] | np.ndarray]] = []
with open(os.path.join(self.root, "json", f"{stem}.json"), "rb") as f:
label = json.load(f)
for line in label["valid_line"]:
for word in line["words"]:
if len(word["text"]) > 0:
x = word["quad"]["x1"], word["quad"]["x2"], word["quad"]["x3"], word["quad"]["x4"]
y = word["quad"]["y1"], word["quad"]["y2"], word["quad"]["y3"], word["quad"]["y4"]
if use_polygons:
box = np.array(
[
[x[0], y[0]],
[x[1], y[1]],
[x[2], y[2]],
[x[3], y[3]],
],
dtype=np_dtype,
)
else:
box = [min(x), min(y), max(x), max(y)]
_targets.append((word["text"], box))
return _targets

def extra_repr(self) -> str:
    """Return the `train=<value>` fragment describing this dataset instance."""
    return f"train={self.train}"
124 changes: 56 additions & 68 deletions doctr/models/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,58 @@ def get_max_width_length_ratio(contour: np.ndarray) -> float:
return max(w / h, h / w)


def _compute_contour_angles(
    contours: list[np.ndarray],
    n_ct: int,
    ratio_threshold_for_lines: float,
) -> list[float]:
    """Collect the orientation angle of the first `n_ct` contours that are elongated enough.

    Args:
        contours: contours to inspect; only `contours[:n_ct]` are used
        n_ct: number of leading contours to inspect
        ratio_threshold_for_lines: side ratio above which a contour is kept as a line

    Returns:
        the angles (as reported by `cv2.minAreaRect`, then normalized) of the kept contours
    """
    line_angles = []
    for candidate in contours[:n_ct]:
        _, (long_side, short_side), theta = cv2.minAreaRect(candidate)

        # Force the first dimension to be the long side so the angle is
        # consistently relative to the major axis.
        if long_side < short_side:
            long_side, short_side = short_side, long_side
            theta -= 90

        # Wrap the angle into (-90, 90]
        while theta <= -90:
            theta += 180
        while theta > 90:
            theta -= 180

        if not short_side > 0:
            # Degenerate rectangle: the side ratio is undefined
            continue

        aspect = long_side / short_side
        if aspect > ratio_threshold_for_lines:
            # select only contours with a ratio like lines
            line_angles.append(theta)
        elif aspect < 1 / ratio_threshold_for_lines:
            # NOTE(review): the swap above guarantees aspect >= 1, so this branch looks
            # unreachable for any positive threshold - confirm before removing it.
            line_angles.append(theta - 90)
    return line_angles


def _compute_median_skew_angle(angles: list[float]) -> int:
    """Reduce a list of contour angles to a single integer skew estimate.

    Args:
        angles: candidate angles

    Returns:
        the low median of `angles` rounded to the nearest integer; 0 when `angles` is empty
        or when the rounded value is exactly 90 or -90
    """
    if not angles:
        # in case no angle is found
        return 0

    # median_low picks a value from the data to avoid outliers
    skew = round(median_low(angles))

    # Resolve the 90-degree flip ambiguity: an estimation of exactly 90/-90 is
    # usually a vertical detection of horizontal lines.
    return 0 if abs(skew) == 90 else skew


def _resolve_final_angle(
    base_angle: int,
    skew_angle: int,
    is_confident: bool,
    page_orientation: int,
) -> int:
    """Combine the base orientation with the estimated skew into the final page angle.

    Args:
        base_angle: orientation the skew is added to
        skew_angle: estimated skew
        is_confident: whether the general page orientation is trusted
        page_orientation: general page orientation, returned unchanged when the skew is rejected

    Returns:
        `page_orientation` when `is_confident` is True and the skew is either a multiple of 90
        or has the same magnitude as a non-zero `page_orientation`; otherwise
        `base_angle + skew_angle` wrapped into (-180, 180]
    """
    if is_confident:
        magnitude = abs(skew_angle)
        # A perpendicular estimate is treated as "no skew" to avoid wrong flips; an estimate
        # with the same magnitude as the page orientation is mostly wrong (sign swapped or
        # completely off), so the general page orientation is preferred in both cases.
        if magnitude % 90 == 0 or (magnitude == abs(page_orientation) and page_orientation != 0):
            return page_orientation

    # Apply the detected skew to the base orientation
    combined = base_angle + skew_angle

    # Wrap into (-180, 180] to handle wrap-around cases (e.g., 180 + 31)
    while combined > 180:
        combined -= 360
    while combined <= -180:
        combined += 360
    return int(combined)


def estimate_orientation(
img: np.ndarray,
general_page_orientation: tuple[int, float] | None = None,
Expand All @@ -56,7 +108,6 @@ def estimate_orientation(
"""
assert len(img.shape) == 3 and img.shape[-1] in [1, 3], f"Image shape {img.shape} not supported"

# Convert image to grayscale if necessary
if img.shape[-1] == 3:
gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
gray_img = cv2.medianBlur(gray_img, 5)
Expand All @@ -69,87 +120,24 @@ def estimate_orientation(
base_angle = page_orientation if is_confident else 0

if is_confident:
# We rotate the image to the general orientation which improves the detection
# No expand needed bitmap is already padded
thresh = rotate_image(thresh, -base_angle)
else: # That's only required if we do not work on the detection models bin map
# try to merge words in lines
else:
(h, w) = img.shape[:2]
k_x = max(1, (floor(w / 100)))
k_y = max(1, (floor(h / 100)))
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (k_x, k_y))
thresh = cv2.dilate(thresh, kernel, iterations=1)

# extract contours
contours, _ = cv2.findContours(thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

# Filter & Sort contours
contours = sorted(
[contour for contour in contours if cv2.contourArea(contour) > lower_area],
key=get_max_width_length_ratio,
reverse=True,
)

angles = []
for contour in contours[:n_ct]:
_, (w, h), angle = cv2.minAreaRect(contour)

# OpenCV version-proof normalization: force 'w' to be the long side
# so the angle is consistently relative to the major axis.
# https://github.com/opencv/opencv/pull/28051/changes
if w < h:
w, h = h, w
angle -= 90

# Normalize angle to be within [-90, 90]
while angle <= -90:
angle += 180
while angle > 90:
angle -= 180

if h > 0:
if w / h > ratio_threshold_for_lines: # select only contours with ratio like lines
angles.append(angle)
elif w / h < 1 / ratio_threshold_for_lines: # if lines are vertical, substract 90 degree
angles.append(angle - 90)

if len(angles) == 0:
skew_angle = 0 # in case no angles is found
else:
# median_low picks a value from the data to avoid outliers
median = -median_low(angles)
skew_angle = -round(median) if abs(median) != 0 else 0

# Resolve the 90-degree flip ambiguity.
# If the estimation is exactly 90/-90, it's usually a vertical detection of horizontal lines.
if abs(skew_angle) == 90:
skew_angle = 0

# combine with the general orientation and the estimated angle
# Apply the detected skew to our base orientation
final_angle = base_angle + skew_angle

# Standardize result to [-179, 180] range to handle wrap-around cases (e.g., 180 + -31)
while final_angle > 180:
final_angle -= 360
while final_angle <= -180:
final_angle += 360

if is_confident:
# If the estimated angle is perpendicular, treat it as 0 to avoid wrong flips
if abs(skew_angle) % 90 == 0:
return page_orientation

# special case where the estimated angle is mostly wrong:
# case 1: - and + swapped
# case 2: estimated angle is completely wrong
# so in this case we prefer the general page orientation
if abs(skew_angle) == abs(page_orientation) and page_orientation != 0:
return page_orientation

return int(
final_angle
) # return the clockwise angle (negative - left side rotation, positive - right side rotation)
angles = _compute_contour_angles(contours, n_ct, ratio_threshold_for_lines)
skew_angle = _compute_median_skew_angle(angles)
return _resolve_final_angle(base_angle, skew_angle, is_confident, page_orientation)


def rectify_crops(
Expand Down
54 changes: 31 additions & 23 deletions doctr/models/detection/_utils/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,35 @@
__all__ = ["_remove_padding"]


def _adjust_coords(
    loc_pred: np.ndarray,
    ratio: float,
    symmetric_pad: bool,
    assume_straight_pages: bool,
    axis: int,
) -> None:
    """Rescale, in place, the coordinates along a given axis to remove padding.

    Args:
        loc_pred: localization predictions, modified in place; columns `axis` and `axis + 2` of
            the last dimension are rescaled for straight pages, index `axis` of the last
            dimension otherwise
        ratio: factor by which the selected coordinates are stretched
        symmetric_pad: whether the padding was symmetric; if so the stretch is applied around 0.5
        assume_straight_pages: whether the pages are assumed to be straight
        axis: 0 for x coordinates, 1 for y coordinates
    """
    # Straight boxes keep both coordinates of an axis in columns (axis, axis + 2);
    # polygons keep a single coordinate per point along the last dimension.
    if assume_straight_pages:
        index = (slice(None), [axis, axis + 2])
    else:
        index = (slice(None), slice(None), axis)

    if symmetric_pad:
        # Padding is split on both sides, so stretch around the center of the padded image
        loc_pred[index] = (loc_pred[index] - 0.5) * ratio + 0.5
    else:
        loc_pred[index] *= ratio


def _remove_padding(
pages: list[np.ndarray],
loc_preds: list[dict[str, np.ndarray]],
Expand All @@ -29,35 +58,14 @@
list of unpadded localization predictions
"""
if preserve_aspect_ratio:
# Rectify loc_preds to remove padding
rectified_preds = []
for page, dict_loc_preds in zip(pages, loc_preds):
for k, loc_pred in dict_loc_preds.items():
h, w = page.shape[0], page.shape[1]
if h > w:
# y unchanged, dilate x coord
if symmetric_pad:
if assume_straight_pages:
loc_pred[:, [0, 2]] = (loc_pred[:, [0, 2]] - 0.5) * h / w + 0.5
else:
loc_pred[:, :, 0] = (loc_pred[:, :, 0] - 0.5) * h / w + 0.5
else:
if assume_straight_pages:
loc_pred[:, [0, 2]] *= h / w
else:
loc_pred[:, :, 0] *= h / w
_adjust_coords(loc_pred, h / w, symmetric_pad, assume_straight_pages, axis=0)
elif w > h:
# x unchanged, dilate y coord
if symmetric_pad:
if assume_straight_pages:
loc_pred[:, [1, 3]] = (loc_pred[:, [1, 3]] - 0.5) * w / h + 0.5
else:
loc_pred[:, :, 1] = (loc_pred[:, :, 1] - 0.5) * w / h + 0.5
else:
if assume_straight_pages:
loc_pred[:, [1, 3]] *= w / h
else:
loc_pred[:, :, 1] *= w / h
_adjust_coords(loc_pred, w / h, symmetric_pad, assume_straight_pages, axis=1)
rectified_preds.append({k: np.clip(loc_pred, 0, 1)})
return rectified_preds
return loc_preds
Loading