merge_text_blocks

This commit is contained in:
Andreas Knuth 2025-03-26 17:42:14 +01:00
parent 8b347d3a4e
commit 44a17ad771
1 changed file with 182 additions and 7 deletions

View File

@ -13,11 +13,33 @@ import datetime
import shutil import shutil
from functools import wraps from functools import wraps
# Logging is configured on this module's logger rather than on the root
# logger (logging.basicConfig), so that records go to both a log file and
# the console with one shared format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Remove handlers left over from an earlier configuration so records are not
# emitted twice; close them so an already-open log file is released.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

_log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# File handler. The encoding is explicit so that non-ASCII text in log
# messages can be written regardless of the platform's default encoding.
file_handler = logging.FileHandler('debug.log', encoding='utf-8')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(_log_formatter)

# Console handler.
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(_log_formatter)

logger.addHandler(file_handler)
logger.addHandler(console_handler)

# Do not forward records to the root logger.
logger.propagate = False
app = Flask(__name__) app = Flask(__name__)
@ -79,6 +101,145 @@ def require_localhost(f):
return f(*args, **kwargs) return f(*args, **kwargs)
return decorated_function return decorated_function
def _box_left(box):
    """Return the smallest x coordinate among the corner points of *box*."""
    return min(point[0] for point in box)


def _box_top(box):
    """Return the smallest y coordinate among the corner points of *box*."""
    return min(point[1] for point in box)


def _box_bottom(box):
    """Return the largest y coordinate among the corner points of *box*."""
    return max(point[1] for point in box)


def _group_into_columns(ocr_results, x_column_threshold):
    """Split OCR items into columns based on the x position of their left edges."""
    ordered = sorted(ocr_results, key=lambda item: _box_left(item[0]))
    columns = [[ordered[0]]]
    for item in ordered[1:]:
        # Each box is compared with the previous box in x order (the last one
        # placed in the current column), not with the column's first box.
        previous_left = _box_left(columns[-1][-1][0])
        if abs(_box_left(item[0]) - previous_left) <= x_column_threshold:
            columns[-1].append(item)
        else:
            columns.append([item])
    return columns


def _close_block(box, text, confidences):
    """Build one result item; the confidence is the mean over all merged lines."""
    if len(confidences) == 1:
        # Unmerged line: hand back the original confidence value untouched.
        confidence = confidences[0]
    else:
        confidence = sum(confidences) / len(confidences)
    return [box, (text, confidence)]


def _merge_column(column, y_threshold, col_number):
    """Merge vertically adjacent lines of one column into text blocks."""
    lines = sorted(column, key=lambda item: _box_top(item[0]))

    blocks = []
    block_box = lines[0][0]
    block_text = lines[0][1][0]
    confidences = [lines[0][1][1]]

    for index, item in enumerate(lines[1:], start=1):
        curr_box = item[0]
        curr_text = item[1][0]
        curr_confidence = item[1][1]

        # Gap between the bottom edge of the block built so far and the top
        # edge of the current line; negative when the two overlap.
        y_distance = _box_top(curr_box) - _box_bottom(block_box)

        logger.debug("Comparing boxes %d and %d in Column %d:", index - 1, index, col_number)
        logger.debug(" Previous text: %s", block_text)
        logger.debug(" Current text: %s", curr_text)
        logger.debug(" Vertical distance (y_distance): %.2f", y_distance)
        logger.debug(" y_threshold: %s", y_threshold)

        if y_distance <= y_threshold:
            logger.debug(" Merging boxes into a single block.")
            # Unpacking (instead of ``+``) works for any mix of lists, tuples
            # and arrays of points.
            points = [*block_box, *curr_box]
            xs = [point[0] for point in points]
            ys = [point[1] for point in points]
            min_x, max_x = min(xs), max(xs)
            min_y, max_y = min(ys), max(ys)
            # Axis-aligned rectangle enclosing both boxes.
            block_box = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]]
            block_text += " " + curr_text
            confidences.append(curr_confidence)
        else:
            logger.debug(" Not merging boxes—starting a new block.")
            blocks.append(_close_block(block_box, block_text, confidences))
            block_box = curr_box
            block_text = curr_text
            confidences = [curr_confidence]

    # Flush the last block of the column.
    blocks.append(_close_block(block_box, block_text, confidences))
    return blocks


def merge_text_blocks(ocr_results, y_threshold=100, x_column_threshold=50):
    """
    Merge OCR-detected lines into text blocks.

    Lines are first grouped into columns by the x position of their left
    edge, then merged top to bottom within each column whenever the vertical
    gap between consecutive lines is small enough.

    Args:
        ocr_results: List of OCR results, where each item is
            [box, (text, confidence)] and box is a sequence of (x, y) points.
            May be empty or None.
        y_threshold: Maximum vertical distance (in pixels) between the bottom
            of a block and the top of the next line for them to be merged.
        x_column_threshold: Maximum horizontal distance (in pixels) between
            the left edges of consecutive boxes (in x order) for them to be
            placed in the same column.

    Returns:
        List of [box, (text, confidence)] items, ordered column by column
        (left to right) and top to bottom within a column. A merged block has
        an axis-aligned bounding box around its lines, the line texts joined
        by single spaces, and the arithmetic mean of the line confidences.
        Lines that were not merged keep their original box and confidence.
    """
    if not ocr_results:
        logger.debug("No OCR results to process.")
        return []

    # Step 1: assign boxes to columns.
    columns = _group_into_columns(ocr_results, x_column_threshold)

    # Step 2: debug output of the detected columns.
    logger.debug("Found %d columns:", len(columns))
    for col_number, column in enumerate(columns, start=1):
        logger.debug("Column %d:", col_number)
        for box_number, item in enumerate(column, start=1):
            box = item[0]
            logger.debug(
                " Box %d: Text='%s', Confidence=%.2f, LeftX=%.2f, TopY=%.2f, Box=%s",
                box_number, item[1][0], item[1][1], _box_left(box), _box_top(box), box,
            )

    # Step 3: within each column, sort by y and merge lines into blocks.
    final_results = []
    for col_number, column in enumerate(columns, start=1):
        logger.debug("\nProcessing Column %d for merging into blocks:", col_number)
        final_results.extend(_merge_column(column, y_threshold, col_number))

    # Debug output of the final blocks.
    logger.debug("\nFinal merged blocks:")
    for block_number, item in enumerate(final_results, start=1):
        logger.debug(
            "Block %d: Text='%s', Confidence=%.2f, Box=%s",
            block_number, item[1][0], item[1][1], item[0],
        )

    return final_results
# OCR endpoint; its raw OCR lines are post-processed with merge_text_blocks
@app.route('/api/ocr', methods=['POST']) @app.route('/api/ocr', methods=['POST'])
def ocr_endpoint(): def ocr_endpoint():
debug_dir = None debug_dir = None
@ -104,6 +265,17 @@ def ocr_endpoint():
processed_image = preprocess_image(np.array(webp_image), debug_dir) processed_image = preprocess_image(np.array(webp_image), debug_dir)
# OCR mit optimierter Konfiguration # OCR mit optimierter Konfiguration
# ocr = PaddleOCR(
# use_angle_cls=True,
# lang='en',
# det_model_dir='en_PP-OCRv3_det',
# rec_model_dir='en_PP-OCRv3_rec',
# det_limit_side_len=processed_image.shape[0] * 2,
# use_dilation=True,
# det_db_score_mode='fast',
# det_db_box_thresh=0.3, # Adjusted parameter
# det_db_unclip_ratio=2.5 # Adjusted parameter
# )
ocr = PaddleOCR( ocr = PaddleOCR(
use_angle_cls=True, use_angle_cls=True,
lang='en', lang='en',
@ -113,7 +285,6 @@ def ocr_endpoint():
use_dilation=True, use_dilation=True,
det_db_score_mode='fast' det_db_score_mode='fast'
) )
# OCR durchführen # OCR durchführen
try: try:
result = ocr.ocr(processed_image, rec=True, cls=True) result = ocr.ocr(processed_image, rec=True, cls=True)
@ -136,9 +307,12 @@ def ocr_endpoint():
'debug_dir': debug_dir 'debug_dir': debug_dir
}), 200 }), 200
# Merge text blocks
merged_results = merge_text_blocks(result[0], y_threshold=15, x_column_threshold=50)
# Ergebnisse verarbeiten # Ergebnisse verarbeiten
extracted_results = [] extracted_results = []
for idx, item in enumerate(result[0]): for idx, item in enumerate(merged_results):
try: try:
box = item[0] box = item[0]
text = item[1][0] if item[1] else '' text = item[1][0] if item[1] else ''
@ -189,6 +363,7 @@ def ocr_endpoint():
'debug_dir': dir_name if debug_dir else None 'debug_dir': dir_name if debug_dir else None
}), 500 }), 500
@app.route('/api/cleanup', methods=['POST']) @app.route('/api/cleanup', methods=['POST'])
@require_localhost @require_localhost
def cleanup_endpoint(): def cleanup_endpoint():
@ -266,4 +441,4 @@ def cleanup_endpoint():
}), 500 }), 500
if __name__ == '__main__':
    # Keep debug mode off: the server binds to all interfaces (0.0.0.0), and
    # Flask's debug mode enables an interactive debugger that can execute
    # arbitrary Python code for anyone able to reach the port.
    app.run(host='0.0.0.0', port=5000, debug=False)