# Dedicated module logger: every record from DEBUG upwards is written both to
# ``debug.log`` and to the console, using one shared line format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

_log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# File handler
file_handler = logging.FileHandler('debug.log')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(_log_formatter)

# Console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(_log_formatter)

# Drop handlers attached earlier so that re-running this setup does not
# duplicate every log line, then attach ours.
logger.handlers.clear()
logger.addHandler(file_handler)
logger.addHandler(console_handler)
# Do not hand records on to the root logger (would print them a second time).
logger.propagate = False


def _box_bounds(box):
    """Return ``(min_x, min_y, max_x, max_y)`` for a box given as ``[x, y]`` points."""
    xs = [point[0] for point in box]
    ys = [point[1] for point in box]
    return min(xs), min(ys), max(xs), max(ys)


def _finalize_block(box, texts, confidences):
    """Build one ``[box, (text, confidence)]`` result from the lines of a block."""
    if len(confidences) == 1:
        # Un-merged line: hand the original confidence value through untouched.
        confidence = confidences[0]
    else:
        # Plain arithmetic mean, so every merged line carries the same weight.
        confidence = sum(confidences) / len(confidences)
    return [box, (" ".join(texts), confidence)]


def merge_text_blocks(ocr_results, y_threshold=100, x_column_threshold=50):
    """
    Merge OCR-detected lines into text blocks, column by column.

    The lines are first grouped into columns by the x coordinate of their
    left edge and then, inside each column, merged from top to bottom as long
    as the vertical gap to the block built so far stays small enough.

    Args:
        ocr_results: List of OCR results, where each item is
            ``[box, (text, confidence)]`` and ``box`` is a sequence of
            ``[x, y]`` points. ``None`` or an empty list yields ``[]``.
        y_threshold: Maximum vertical distance (in pixels) between the bottom
            edge of the current block and the top edge of the next line for
            the line to be merged into the block. Overlapping lines (negative
            distance) are always merged.
        x_column_threshold: Maximum horizontal distance (in pixels) between
            the left edges of two neighbouring boxes (in left-edge order) for
            them to be put into the same column. The comparison is made
            against the previous box, so a column may drift by more than this
            value in total.

    Returns:
        List of ``[box, (text, confidence)]`` items, columns from left to
        right and blocks from top to bottom inside a column. A block built
        from several lines has an axis-aligned bounding box
        (top-left, top-right, bottom-right, bottom-left), the line texts
        joined by single spaces and the arithmetic mean of the line
        confidences. A line that was not merged keeps its original box, text
        and confidence.
    """
    if not ocr_results:
        logger.debug("No OCR results to process.")
        return []

    # Step 1: group the boxes into columns by the x coordinate of the left edge.
    sorted_by_x = sorted(ocr_results, key=lambda item: _box_bounds(item[0])[0])

    columns = []
    current_column = [sorted_by_x[0]]
    for item in sorted_by_x[1:]:
        prev_left_x = _box_bounds(current_column[-1][0])[0]
        curr_left_x = _box_bounds(item[0])[0]
        if abs(curr_left_x - prev_left_x) <= x_column_threshold:
            # Box belongs to the current column.
            current_column.append(item)
        else:
            # Left edge jumped too far: start a new column.
            columns.append(current_column)
            current_column = [item]
    columns.append(current_column)

    # Step 2: debug dump of the detected columns.
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("Found %d columns:", len(columns))
        for col_idx, column in enumerate(columns, start=1):
            logger.debug("Column %d:", col_idx)
            for box_idx, item in enumerate(column, start=1):
                box = item[0]
                left_x, top_y, _, _ = _box_bounds(box)
                logger.debug(
                    "  Box %d: Text='%s', Confidence=%.2f, LeftX=%.2f, TopY=%.2f, Box=%s",
                    box_idx, item[1][0], item[1][1], left_x, top_y, box,
                )

    # Step 3: inside every column sort top-to-bottom and merge close lines.
    final_results = []
    for col_idx, column in enumerate(columns, start=1):
        logger.debug("\nProcessing Column %d for merging into blocks:", col_idx)

        sorted_column = sorted(column, key=lambda item: _box_bounds(item[0])[1])

        # State of the block currently being built.
        block_box = sorted_column[0][0]
        block_texts = [sorted_column[0][1][0]]
        block_confidences = [sorted_column[0][1][1]]

        for i, item in enumerate(sorted_column[1:], start=1):
            curr_box = item[0]
            curr_text = item[1][0]
            curr_confidence = item[1][1]

            # Gap between the bottom edge of the block built so far and the
            # top edge of the current line.
            prev_bottom_y = _box_bounds(block_box)[3]
            curr_top_y = _box_bounds(curr_box)[1]
            y_distance = curr_top_y - prev_bottom_y

            logger.debug("Comparing boxes %d and %d in Column %d:", i - 1, i, col_idx)
            logger.debug("  Previous text: %s", " ".join(block_texts))
            logger.debug("  Current text: %s", curr_text)
            logger.debug("  Vertical distance (y_distance): %.2f", y_distance)
            logger.debug("  y_threshold: %s", y_threshold)

            if y_distance <= y_threshold:
                logger.debug("  Merging boxes into a single block.")
                # Grow the bounding box so that it encloses both boxes.
                # Unpacking instead of ``+`` keeps this correct for tuples and
                # array-like boxes, where ``+`` would not concatenate.
                min_x, min_y, max_x, max_y = _box_bounds([*block_box, *curr_box])
                block_box = [[min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y]]
                block_texts.append(curr_text)
                block_confidences.append(curr_confidence)
            else:
                logger.debug("  Not merging boxes—starting a new block.")
                final_results.append(_finalize_block(block_box, block_texts, block_confidences))
                block_box = curr_box
                block_texts = [curr_text]
                block_confidences = [curr_confidence]

        # Last block of the column.
        final_results.append(_finalize_block(block_box, block_texts, block_confidences))

    # Debug dump of the final blocks.
    logger.debug("\nFinal merged blocks:")
    for idx, item in enumerate(final_results, start=1):
        logger.debug(
            "Block %d: Text='%s', Confidence=%.2f, Box=%s",
            idx, item[1][0], item[1][1], item[0],
        )

    return final_results
if __name__ == '__main__':
    # The server binds to every interface (0.0.0.0). Flask's debug mode would
    # expose the interactive Werkzeug debugger, which lets anyone who can
    # reach the port execute arbitrary Python code, so it must stay disabled
    # here. For local debugging, bind to 127.0.0.1 before turning it on.
    app.run(host='0.0.0.0', port=5000, debug=False)