东方市护送信息网

Windows和Linux下使用Python搭建一个图片OCR工具

2026-04-07 07:07:01 浏览次数:0
详细信息

项目结构

ocr-tool/
├── main.py              # 主程序
├── requirements.txt     # 依赖包
├── config.yaml         # 配置文件
├── gui.py             # 图形界面
├── cli.py             # 命令行界面
├── utils/
│   ├── image_processor.py
│   └── ocr_engine.py
└── README.md

1. 环境配置

requirements.txt

Pillow>=9.0.0
pytesseract>=0.3.10
opencv-python>=4.8.0
paddlepaddle>=2.5.0
paddleocr>=2.7.0
pyqt5>=5.15.0
pyyaml>=6.0
numpy>=1.24.0
python-dotenv>=1.0.0

系统依赖安装

Windows:

# 安装Tesseract OCR
# 下载地址: https://github.com/UB-Mannheim/tesseract/wiki
# 安装后需要将Tesseract添加到系统PATH

# 安装Python依赖
pip install -r requirements.txt

Linux (Ubuntu/Debian):

# 安装系统依赖
sudo apt update
sudo apt install -y tesseract-ocr tesseract-ocr-chi-sim
sudo apt install -y libgl1-mesa-glx libglib2.0-0

# 安装Python依赖
pip install -r requirements.txt

2. 核心代码实现

main.py - 主程序入口

#!/usr/bin/env python3
"""
跨平台OCR工具主程序
支持Windows和Linux系统
"""

import sys
import os
from pathlib import Path

def setup_environment():
    """Prepare the runtime environment shared by the CLI and the GUI.

    On Windows, if Tesseract is found at its default install location, the
    path is exported as ``TESSERACT_CMD`` and also assigned to
    ``pytesseract.pytesseract.tesseract_cmd``. The latter is the setting
    pytesseract actually consults; the environment variable alone has no
    effect on it.

    The ``output`` and ``logs`` directories are created in the current
    working directory if they do not exist yet.
    """
    if sys.platform == 'win32':
        tesseract_path = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
        if os.path.exists(tesseract_path):
            os.environ['TESSERACT_CMD'] = tesseract_path
            try:
                import pytesseract
            except ImportError:
                # Deliberately tolerated: the tool can still run with the
                # PaddleOCR engine when pytesseract is not installed.
                pass
            else:
                pytesseract.pytesseract.tesseract_cmd = tesseract_path

    # Working directories used by the tool.
    for directory in ('output', 'logs'):
        Path(directory).mkdir(exist_ok=True)

def main():
    """Program entry point: dispatch to the CLI or the GUI.

    When the first command-line argument is ``--cli`` the command-line
    front end is started; otherwise the graphical front end is started.

    Returns:
        Whatever the selected front end's ``main()`` returns (the CLI
        returns an integer exit status).
    """
    setup_environment()

    if len(sys.argv) > 1 and sys.argv[1] == '--cli':
        from cli import main as cli_main
        # cli.main() parses sys.argv with argparse, which does not know the
        # "--cli" mode switch and would abort with "unrecognized arguments".
        del sys.argv[1]
        return cli_main()

    from gui import main as gui_main
    return gui_main()


if __name__ == '__main__':
    # Propagate the front end's status as the process exit code.
    sys.exit(main())

utils/ocr_engine.py - OCR引擎封装

import pytesseract
import cv2
import numpy as np
from PIL import Image
import logging
from typing import List, Tuple, Optional, Dict
import os

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class OCREngine:
    """Run OCR through Tesseract (via pytesseract) or PaddleOCR.

    Attributes set by ``__init__``:
        lang: Language string handed to pytesseract (e.g. ``'eng+chi_sim'``).
        engine_type: ``'paddle'`` when PaddleOCR was requested and could be
            imported; otherwise the value passed in (Tesseract is used for
            anything that is not ``'paddle'``).
        paddle_ocr: The ``PaddleOCR`` instance, or ``None``.
    """

    def __init__(self, lang='eng+chi_sim', engine='tesseract'):
        """Initialise the engine.

        Args:
            lang: Tesseract language setting, e.g. 'eng', 'chi_sim',
                'eng+chi_sim'. PaddleOCR is always created with ``lang='ch'``.
            engine: 'tesseract' or 'paddle'. If 'paddle' is requested but
                importing it fails, the engine falls back to Tesseract.
        """
        self.lang = lang
        self.engine_type = engine
        self.paddle_ocr = None

        if engine == 'paddle':
            try:
                # Imported lazily so Tesseract-only installs keep working.
                from paddleocr import PaddleOCR
                self.paddle_ocr = PaddleOCR(use_angle_cls=True, lang='ch')
                logger.info("PaddleOCR引擎初始化成功")
            except ImportError:
                logger.warning("PaddleOCR未安装,将使用Tesseract")
                self.engine_type = 'tesseract'

        logger.info(f"OCR引擎初始化完成,类型: {self.engine_type}, 语言: {lang}")

    @staticmethod
    def _mean_confidence(values) -> float:
        """Average the positive confidence values, or 0.0 if there are none.

        Non-positive entries are skipped (pytesseract reports -1 for boxes
        that carry no recognised word). Values may be ints, floats or numeric
        strings; anything that cannot be parsed as a number is ignored.
        """
        scores = []
        for raw in values:
            try:
                score = float(raw)
            except (TypeError, ValueError):
                continue
            if score > 0:
                scores.append(score)
        return float(np.mean(scores)) if scores else 0.0

    @staticmethod
    def _read_image(path: str):
        """Decode the image file at *path*; return None if it cannot be read."""
        img = cv2.imread(path)
        if img is None:
            # cv2.imread may fail on paths it cannot open (commonly reported
            # for non-ASCII paths on Windows). Read the bytes with NumPy and
            # decode them in memory instead.
            try:
                raw = np.fromfile(path, dtype=np.uint8)
            except OSError:
                return None
            if raw.size:
                img = cv2.imdecode(raw, cv2.IMREAD_COLOR)
        return img

    def preprocess_image(self, image_path: str) -> np.ndarray:
        """Binarise an image before handing it to Tesseract.

        Args:
            image_path: Path of an image file (``str`` or ``os.PathLike``),
                or an already-decoded image array.

        Returns:
            The image after grayscale conversion, Gaussian adaptive
            thresholding (block size 11, C = 2) and a 3x3 median blur.

        Raises:
            ValueError: If the image cannot be read.
        """
        try:
            if isinstance(image_path, (str, os.PathLike)):
                img = self._read_image(os.fspath(image_path))
            else:
                # Anything else is treated as an image array.
                img = image_path

            if img is None:
                raise ValueError("无法读取图像")

            # Colour images (3-D arrays) are reduced to a single channel.
            if len(img.shape) == 3:
                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            else:
                gray = img

            processed = cv2.adaptiveThreshold(
                gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY, 11, 2
            )

            # Median blur removes speckle left by thresholding.
            processed = cv2.medianBlur(processed, 3)

            return processed

        except Exception as e:
            logger.error(f"图像预处理失败: {e}")
            raise

    def ocr_with_tesseract(self, image_path: str) -> Dict:
        """Recognise text with Tesseract.

        Args:
            image_path: Image path or image array (see ``preprocess_image``).

        Returns:
            Dict with keys ``text`` (stripped text), ``confidence`` (mean of
            the positive per-box confidences reported by Tesseract, 0.0 when
            there are none), ``engine``, ``language`` and ``details`` (the
            raw ``image_to_data`` dict).

        Raises:
            Exception: Any error from preprocessing or pytesseract is logged
                and re-raised.
        """
        try:
            processed_img = self.preprocess_image(image_path)
            pil_img = Image.fromarray(processed_img)

            # OEM 3: default engine mode; PSM 6: assume one uniform text block.
            custom_config = r'--oem 3 --psm 6'

            text = pytesseract.image_to_string(
                pil_img,
                lang=self.lang,
                config=custom_config
            )

            data = pytesseract.image_to_data(
                pil_img,
                lang=self.lang,
                config=custom_config,
                output_type=pytesseract.Output.DICT
            )

            result = {
                'text': text.strip(),
                'confidence': self._mean_confidence(data['conf']),
                'engine': 'tesseract',
                'language': self.lang,
                'details': data
            }

            logger.info(f"Tesseract OCR识别完成,置信度: {result['confidence']:.2f}")
            return result

        except Exception as e:
            logger.error(f"Tesseract OCR失败: {e}")
            raise

    def ocr_with_paddle(self, image_path: str) -> Dict:
        """Recognise text with PaddleOCR.

        Args:
            image_path: Image path or image array, passed to
                ``PaddleOCR.ocr`` (path-like objects are converted to ``str``).

        Returns:
            Dict with keys ``text`` (lines joined by newlines), ``confidence``
            (mean of the per-line scores as returned by PaddleOCR, 0.0 when
            nothing was detected), ``engine``, ``language`` and ``details``
            (list of ``{'bbox', 'text', 'confidence'}`` dicts).

        Raises:
            ValueError: If PaddleOCR was not initialised.
        """
        try:
            if self.paddle_ocr is None:
                raise ValueError("PaddleOCR未初始化")

            source = os.fspath(image_path) if isinstance(image_path, os.PathLike) else image_path
            result = self.paddle_ocr.ocr(source, cls=True)

            # The first element holds the detected lines for the image; it is
            # None/empty (or the whole result is) when nothing was detected.
            lines = result[0] if result and result[0] else []

            text_blocks = [
                {
                    'bbox': line[0],
                    'text': line[1][0],
                    'confidence': line[1][1]
                }
                for line in lines
            ]
            result_text = '\n'.join(block['text'] for block in text_blocks)

            return {
                'text': result_text,
                'confidence': float(np.mean([b['confidence'] for b in text_blocks])) if text_blocks else 0.0,
                'engine': 'paddle',
                'language': 'ch',
                'details': text_blocks
            }

        except Exception as e:
            logger.error(f"PaddleOCR失败: {e}")
            raise

    def recognize(self, image_path: str) -> Dict:
        """Run OCR with the configured engine.

        Args:
            image_path: Image path or image array.

        Returns:
            The result dict of ``ocr_with_paddle`` or ``ocr_with_tesseract``.
        """
        if isinstance(image_path, (str, os.PathLike)):
            source = image_path
        else:
            # Avoid formatting pixel data into the log line.
            source = f"<array shape={getattr(image_path, 'shape', None)}>"
        logger.info(f"开始OCR识别: {source}")

        if self.engine_type == 'paddle':
            return self.ocr_with_paddle(image_path)
        return self.ocr_with_tesseract(image_path)

    def batch_recognize(self, image_paths: List[str]) -> List[Dict]:
        """Run OCR on several image files.

        A failure on one image does not stop the batch; that image gets an
        entry with ``error`` and an empty ``text`` instead.

        Args:
            image_paths: Image file paths.

        Returns:
            One result dict per input, in input order, each carrying a
            ``filename`` key with the file's base name.
        """
        results = []
        total = len(image_paths)
        for i, img_path in enumerate(image_paths, 1):
            try:
                logger.info(f"处理第 {i}/{total} 张图片: {img_path}")
                result = self.recognize(img_path)
                result['filename'] = os.path.basename(img_path)
                results.append(result)
            except Exception as e:
                logger.error(f"处理图片失败 {img_path}: {e}")
                results.append({
                    'filename': os.path.basename(img_path),
                    'error': str(e),
                    'text': ''
                })

        return results

utils/image_processor.py - 图像处理工具

import cv2
import numpy as np
from PIL import Image
import io
from typing import Union, Tuple
import logging

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

class ImageProcessor:
    """Static OpenCV helpers for preparing images before OCR."""

    @staticmethod
    def _fit_within(width: int, height: int, max_size: Tuple[int, int]) -> Tuple[int, int]:
        """Return (width, height) shrunk to fit *max_size*, keeping aspect ratio.

        The size is returned unchanged when it already fits. Each side of a
        shrunk size is at least 1 pixel, so extreme aspect ratios never
        produce an empty target size.
        """
        max_width, max_height = max_size
        if width <= max_width and height <= max_height:
            return width, height

        scale = min(max_width / width, max_height / height)
        return max(1, int(width * scale)), max(1, int(height * scale))

    @staticmethod
    def resize_image(image: np.ndarray, max_size: Tuple[int, int] = (2000, 2000)) -> np.ndarray:
        """Shrink an image that exceeds *max_size*; never enlarges.

        Args:
            image: Input image array.
            max_size: Maximum size as (width, height).

        Returns:
            The resized image, or the input object itself when it already
            fits within *max_size*.
        """
        height, width = image.shape[:2]
        new_width, new_height = ImageProcessor._fit_within(width, height, max_size)

        if (new_width, new_height) == (width, height):
            return image

        resized = cv2.resize(image, (new_width, new_height),
                             interpolation=cv2.INTER_AREA)
        logger.info(f"图像从 {width}x{height} 调整到 {new_width}x{new_height}")
        return resized

    @staticmethod
    def enhance_image(image: np.ndarray) -> np.ndarray:
        """Increase contrast and sharpness.

        Args:
            image: Input image array (colour or single-channel).

        Returns:
            A single-channel image after histogram equalisation followed by
            a 3x3 sharpening filter.
        """
        # Colour input (3-D array) is converted to grayscale first.
        if len(image.shape) == 3:
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        else:
            gray = image.copy()

        enhanced = cv2.equalizeHist(gray)

        # Sharpening kernel: centre weight 9, neighbours -1 (sums to 1).
        kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
        enhanced = cv2.filter2D(enhanced, -1, kernel)

        return enhanced

    @staticmethod
    def deskew_image(image: np.ndarray) -> np.ndarray:
        """Rotate the image to correct skew estimated from its dark pixels.

        The angle comes from the minimum-area rectangle around all pixels
        that are foreground after an inverted Otsu threshold. Images whose
        estimated skew is at most 1 degree are returned unchanged.

        Args:
            image: Input image array.

        Returns:
            The rotated image (same width and height, borders replicated),
            or the input object itself when no rotation is applied.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) == 3 else image.copy()
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

        # Coordinates of every foreground pixel, in (row, column) order.
        coords = np.column_stack(np.where(binary > 0))

        if len(coords) > 0:
            angle = cv2.minAreaRect(coords)[-1]

            # Fold the rectangle angle into the range [-45, 45].
            # NOTE(review): the range/sign of the angle returned by
            # cv2.minAreaRect differs between OpenCV releases - verify the
            # rotation direction against the OpenCV version in use.
            if angle < -45:
                angle = 90 + angle
            elif angle > 45:
                angle = angle - 90

            # Only rotate images that are noticeably skewed.
            if abs(angle) > 1.0:
                (h, w) = image.shape[:2]
                center = (w // 2, h // 2)
                M = cv2.getRotationMatrix2D(center, angle, 1.0)
                rotated = cv2.warpAffine(image, M, (w, h),
                                         flags=cv2.INTER_CUBIC,
                                         borderMode=cv2.BORDER_REPLICATE)
                logger.info(f"图像纠偏: 旋转 {angle:.2f} 度")
                return rotated

        return image

    @staticmethod
    def bytes_to_image(image_bytes: bytes) -> np.ndarray:
        """Decode encoded image bytes into a colour OpenCV array.

        Args:
            image_bytes: Encoded image data (e.g. the contents of a PNG file).

        Returns:
            The result of ``cv2.imdecode(..., cv2.IMREAD_COLOR)``, returned
            as-is.
        """
        nparr = np.frombuffer(image_bytes, np.uint8)
        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
        return img

    @staticmethod
    def image_to_bytes(image: np.ndarray, format: str = 'PNG') -> bytes:
        """Encode an OpenCV image into bytes.

        Args:
            image: OpenCV image array.
            format: Target format / file extension, case-insensitive, with or
                without a leading dot (``'PNG'``, ``'jpg'``, ``'.png'``).

        Returns:
            The encoded image data.

        Raises:
            ValueError: If OpenCV reports that encoding failed.
        """
        # cv2.imencode expects an extension with exactly one leading dot.
        extension = '.' + format.lower().lstrip('.')
        success, encoded_image = cv2.imencode(extension, image)
        if success:
            return encoded_image.tobytes()
        raise ValueError(f"图像编码失败: {format}")

config.yaml - 配置文件

# OCR tool configuration
# NOTE(review): none of the modules shown alongside this file load it; the
# values presumably mirror the CLI/GUI defaults - confirm before relying on it.
ocr:
  default_language: "eng+chi_sim"  # default recognition language(s)
  default_engine: "tesseract"      # default engine (tesseract/paddle)
  tesseract_path: ""               # path to Tesseract (needs to be set on Windows)

image:
  max_width: 2000
  max_height: 2000
  enable_preprocess: true
  enable_deskew: true

output:
  default_format: "txt"           # output format (txt/json)
  save_to_file: true
  output_dir: "./output"

gui:
  window_width: 1200
  window_height: 800
  theme: "light"                  # light/dark

cli.py - 命令行界面

#!/usr/bin/env python3
"""
OCR工具命令行界面
"""

import argparse
import sys
import os
import json
from pathlib import Path
from datetime import datetime
import logging
from utils.ocr_engine import OCREngine
from utils.image_processor import ImageProcessor

# Configure logging at import time: INFO level, "time - level - message" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)

def save_results(results, output_dir='./output', format='txt'):
    """Write OCR results to a timestamped file.

    Args:
        results: List of result dicts. The text format uses the optional
            ``filename``, ``text`` and ``error`` keys; the JSON format dumps
            the list as-is.
        output_dir: Directory to write into; created if it does not exist.
        format: ``'txt'`` or ``'json'``.

    Returns:
        The ``Path`` of the file written, or ``None`` when *format* is not
        supported (an error is logged and nothing is written).
    """
    if format not in ('json', 'txt'):
        logger.error(f"不支持的输出格式: {format}")
        return None

    target_dir = Path(output_dir)
    # Do not depend on the caller having created the directory.
    target_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = target_dir / f'ocr_results_{timestamp}.{format}'

    with open(output_file, 'w', encoding='utf-8') as f:
        if format == 'json':
            json.dump(results, f, ensure_ascii=False, indent=2)
        else:
            for result in results:
                if 'filename' in result:
                    f.write(f"=== 文件: {result['filename']} ===\n")
                if 'text' in result:
                    f.write(result['text'])
                    f.write("\n\n")
                if 'error' in result:
                    f.write(f"错误: {result['error']}\n\n")

    logger.info(f"结果已保存到: {output_file}")
    return output_file

def _read_image(path):
    """Decode the image file at *path*; return None if it cannot be read."""
    # Local imports: this module's import block does not bring in cv2/numpy.
    import cv2
    import numpy as np

    image = cv2.imread(str(path))
    if image is None:
        # cv2.imread may fail on paths it cannot open (commonly reported for
        # non-ASCII paths on Windows); decode the raw bytes in memory instead.
        try:
            raw = np.fromfile(str(path), dtype=np.uint8)
        except OSError:
            return None
        if raw.size:
            image = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    return image


def _collect_image_paths(inputs):
    """Expand the files/directories given on the command line into image paths.

    Directories contribute their direct children with an image extension
    (matched case-insensitively, in sorted order); files are taken as given;
    anything else is skipped with a warning.
    """
    suffixes = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff'}
    collected = []
    for raw in inputs:
        path = Path(raw)
        if path.is_dir():
            collected.extend(sorted(
                child for child in path.iterdir()
                if child.is_file() and child.suffix.lower() in suffixes
            ))
        elif path.is_file():
            collected.append(path)
        else:
            logger.warning(f"输入路径不存在,已跳过: {raw}")
    return collected


def main():
    """Command-line entry point: OCR the given images and save the results.

    Parses ``sys.argv``, runs every image through the optional enhancement /
    deskew steps and the selected OCR engine, prints each result and writes
    all of them to the output directory. ``--batch`` is accepted but has no
    effect: every input is processed regardless.

    Returns:
        0 when at least one image was processed successfully; 1 when no
        image was found, the engine could not be initialised, or every
        image failed.
    """
    parser = argparse.ArgumentParser(description='OCR图片识别工具')
    parser.add_argument('input', nargs='+', help='输入图片路径或目录')
    parser.add_argument('-l', '--language', default='eng+chi_sim',
                        help='OCR语言 (默认: eng+chi_sim)')
    parser.add_argument('-e', '--engine', default='tesseract',
                        choices=['tesseract', 'paddle'],
                        help='OCR引擎 (默认: tesseract)')
    parser.add_argument('-o', '--output', default='./output',
                        help='输出目录 (默认: ./output)')
    parser.add_argument('-f', '--format', default='txt',
                        choices=['txt', 'json'],
                        help='输出格式 (默认: txt)')
    parser.add_argument('--preprocess', action='store_true',
                        help='启用图像预处理')
    parser.add_argument('--deskew', action='store_true',
                        help='启用图像纠偏')
    parser.add_argument('--batch', action='store_true',
                        help='批量处理模式')

    args = parser.parse_args()

    Path(args.output).mkdir(parents=True, exist_ok=True)

    image_paths = _collect_image_paths(args.input)
    if not image_paths:
        logger.error("未找到图片文件")
        return 1

    try:
        ocr = OCREngine(lang=args.language, engine=args.engine)
    except Exception as e:
        logger.error(f"OCR引擎初始化失败: {e}")
        return 1

    results = []
    failed = 0
    processor = ImageProcessor()

    for img_path in image_paths:
        try:
            logger.info(f"处理图片: {img_path}")

            image = _read_image(img_path)
            if image is None:
                # Recorded as a failed result below, like any other error.
                raise ValueError(f"无法读取图片: {img_path}")

            if args.preprocess:
                image = processor.enhance_image(image)

            if args.deskew:
                image = processor.deskew_image(image)

            image = processor.resize_image(image)

            result = ocr.recognize(image)
            result['filename'] = img_path.name
            results.append(result)

            print(f"\n文件: {img_path.name}")
            print(f"置信度: {result.get('confidence', 0):.2f}")
            print(f"识别结果:\n{result.get('text', '')}")
            print("-" * 50)

        except Exception as e:
            failed += 1
            logger.error(f"处理失败 {img_path}: {e}")
            results.append({
                'filename': img_path.name,
                'error': str(e),
                'text': ''
            })

    if results:
        save_results(results, args.output, args.format)

    return 1 if failed == len(image_paths) else 0


if __name__ == '__main__':
    sys.exit(main())

gui.py - 图形界面(PyQt5)

#!/usr/bin/env python3
"""
OCR工具图形界面
"""

import sys
import os
from pathlib import Path
from PyQt5.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout, 
                             QHBoxLayout, QPushButton, QTextEdit, QLabel, 
                             QFileDialog, QComboBox, QProgressBar, QMessageBox,
                             QSplitter, QGroupBox, QCheckBox, QSpinBox, QTabWidget)
from PyQt5.QtCore import Qt, QThread, pyqtSignal
from PyQt5.QtGui import QFont, QPixmap, QImage
import cv2
import json
from datetime import datetime
from utils.ocr_engine import OCREngine
from utils.image_processor import ImageProcessor

class OCRWorker(QThread):
    """Background thread that runs OCR over a list of image files.

    Signals:
        progress(int): Percentage of images started so far; 100 at the end.
        result_ready(dict): One result (or error) dict per image.
        error(str): Emitted when the run as a whole fails, e.g. the engine
            cannot be created.
        finished(): The signal inherited from QThread, emitted by Qt once
            ``run`` has returned. It is intentionally not redeclared or
            emitted by hand: doing so hides the built-in signal and notifies
            listeners while the thread is still running.
    """
    progress = pyqtSignal(int)
    result_ready = pyqtSignal(dict)
    error = pyqtSignal(str)

    def __init__(self, image_paths, language, engine, preprocess, deskew):
        """Store the job parameters.

        Args:
            image_paths: Paths (``pathlib.Path``) of the images to process.
            language: Language string passed to ``OCREngine``.
            engine: Engine name passed to ``OCREngine``.
            preprocess: Whether to run ``ImageProcessor.enhance_image``.
            deskew: Whether to run ``ImageProcessor.deskew_image``.
        """
        super().__init__()
        self.image_paths = image_paths
        self.language = language
        self.engine = engine
        self.preprocess = preprocess
        self.deskew = deskew
        self.canceled = False

    @staticmethod
    def _read_image(path):
        """Decode the image file at *path*; return None if it cannot be read."""
        image = cv2.imread(str(path))
        if image is None:
            # cv2.imread may fail on paths it cannot open (commonly reported
            # for non-ASCII paths on Windows); decode the bytes in memory.
            import numpy as np
            try:
                raw = np.fromfile(str(path), dtype=np.uint8)
            except OSError:
                return None
            if raw.size:
                image = cv2.imdecode(raw, cv2.IMREAD_COLOR)
        return image

    def run(self):
        """Thread body: process each image and report results via signals."""
        try:
            ocr = OCREngine(lang=self.language, engine=self.engine)
            processor = ImageProcessor()
            total = len(self.image_paths)

            for i, img_path in enumerate(self.image_paths):
                # The flag is checked between images only; an image that is
                # already being processed runs to completion.
                if self.canceled:
                    break

                try:
                    self.progress.emit(int((i / total) * 100))

                    image = self._read_image(img_path)
                    if image is None:
                        # Report the failure instead of dropping the file.
                        raise ValueError(f"无法读取图片: {img_path}")

                    if self.preprocess:
                        image = processor.enhance_image(image)

                    if self.deskew:
                        image = processor.deskew_image(image)

                    image = processor.resize_image(image)

                    result = ocr.recognize(image)
                    result['filename'] = str(img_path.name)
                    self.result_ready.emit(result)

                except Exception as e:
                    self.result_ready.emit({
                        'filename': str(img_path.name),
                        'error': str(e),
                        'text': ''
                    })

            self.progress.emit(100)

        except Exception as e:
            self.error.emit(str(e))

    def cancel(self):
        """Ask the worker to stop before starting the next image."""
        self.canceled = True

class OCREditor(QMainWindow):
    """Main window: image preview on the left, OCR results on the right.

    Attributes:
        image_paths: ``pathlib.Path`` objects of the images selected for OCR.
        current_image: Initialised to ``None``; not otherwise used here.
        worker: The running ``OCRWorker``, or ``None`` when idle.
        results: Result dicts received during the current/last run, in order.
    """

    def __init__(self):
        """Create the window state and build the UI."""
        super().__init__()
        self.image_paths = []
        self.current_image = None
        self.worker = None
        self.results = []
        self.init_ui()

    def init_ui(self):
        """Build the widgets: toolbar, preview/result splitter, progress bar."""
        self.setWindowTitle('OCR图片识别工具')
        self.setGeometry(100, 100, 1200, 800)

        central_widget = QWidget()
        self.setCentralWidget(central_widget)

        main_layout = QVBoxLayout(central_widget)

        toolbar = self.create_toolbar()
        main_layout.addLayout(toolbar)

        splitter = QSplitter(Qt.Horizontal)

        # Left side: image preview and file information.
        left_widget = QWidget()
        left_layout = QVBoxLayout(left_widget)

        self.image_label = QLabel('点击"选择图片"加载图片')
        self.image_label.setAlignment(Qt.AlignCenter)
        self.image_label.setStyleSheet('border: 1px solid #ccc;')
        self.image_label.setMinimumSize(400, 300)
        left_layout.addWidget(self.image_label)

        info_group = QGroupBox('图片信息')
        info_layout = QVBoxLayout()
        self.info_text = QLabel('')
        info_layout.addWidget(self.info_text)
        info_group.setLayout(info_layout)
        left_layout.addWidget(info_group)

        splitter.addWidget(left_widget)

        # Right side: result tabs and statistics.
        right_widget = QWidget()
        right_layout = QVBoxLayout(right_widget)

        self.tab_widget = QTabWidget()

        # Plain-text result tab.
        text_tab = QWidget()
        text_layout = QVBoxLayout(text_tab)
        self.result_text = QTextEdit()
        self.result_text.setFont(QFont('Consolas', 10))
        text_layout.addWidget(self.result_text)
        self.tab_widget.addTab(text_tab, '文本结果')

        # JSON tab (shows the most recent result).
        json_tab = QWidget()
        json_layout = QVBoxLayout(json_tab)
        self.json_text = QTextEdit()
        self.json_text.setFont(QFont('Consolas', 10))
        json_layout.addWidget(self.json_text)
        self.tab_widget.addTab(json_tab, 'JSON视图')

        right_layout.addWidget(self.tab_widget)

        stats_group = QGroupBox('识别统计')
        stats_layout = QHBoxLayout()
        self.confidence_label = QLabel('置信度: --')
        self.language_label = QLabel('语言: --')
        self.engine_label = QLabel('引擎: --')
        stats_layout.addWidget(self.confidence_label)
        stats_layout.addWidget(self.language_label)
        stats_layout.addWidget(self.engine_label)
        stats_group.setLayout(stats_layout)
        right_layout.addWidget(stats_group)

        splitter.addWidget(right_widget)

        # Initial split: 400 px preview, 800 px results.
        splitter.setSizes([400, 800])
        main_layout.addWidget(splitter)

        # Hidden until a recognition run starts.
        self.progress_bar = QProgressBar()
        self.progress_bar.setVisible(False)
        main_layout.addWidget(self.progress_bar)

    def create_toolbar(self):
        """Build the toolbar row and return it as a ``QHBoxLayout``."""
        toolbar = QHBoxLayout()

        self.select_btn = QPushButton('选择图片')
        self.select_btn.clicked.connect(self.select_images)
        toolbar.addWidget(self.select_btn)

        self.select_dir_btn = QPushButton('选择文件夹')
        self.select_dir_btn.clicked.connect(self.select_directory)
        toolbar.addWidget(self.select_dir_btn)

        # Language selector; index 2 ("英文+中文") is the default.
        toolbar.addWidget(QLabel('语言:'))
        self.lang_combo = QComboBox()
        self.lang_combo.addItems(['英文', '中文', '英文+中文', '日语', '韩语'])
        self.lang_combo.setCurrentIndex(2)
        toolbar.addWidget(self.lang_combo)

        toolbar.addWidget(QLabel('引擎:'))
        self.engine_combo = QComboBox()
        self.engine_combo.addItems(['Tesseract', 'PaddleOCR'])
        toolbar.addWidget(self.engine_combo)

        self.preprocess_check = QCheckBox('图像预处理')
        self.preprocess_check.setChecked(True)
        toolbar.addWidget(self.preprocess_check)

        self.deskew_check = QCheckBox('自动纠偏')
        toolbar.addWidget(self.deskew_check)

        self.start_btn = QPushButton('开始识别')
        self.start_btn.clicked.connect(self.start_ocr)
        self.start_btn.setStyleSheet('background-color: #4CAF50; color: white;')
        toolbar.addWidget(self.start_btn)

        # Only enabled while a run is in progress.
        self.stop_btn = QPushButton('停止')
        self.stop_btn.clicked.connect(self.stop_ocr)
        self.stop_btn.setEnabled(False)
        toolbar.addWidget(self.stop_btn)

        self.save_btn = QPushButton('保存结果')
        self.save_btn.clicked.connect(self.save_results)
        toolbar.addWidget(self.save_btn)

        toolbar.addStretch()

        return toolbar

    def select_images(self):
        """Let the user pick image files and preview the first one."""
        files, _ = QFileDialog.getOpenFileNames(
            self, '选择图片文件', '',
            '图片文件 (*.png *.jpg *.jpeg *.bmp *.tiff)'
        )

        if files:
            self.image_paths = [Path(f) for f in files]
            self.load_first_image()

    def select_directory(self):
        """Let the user pick a folder and select the images directly inside it.

        Extensions are matched case-insensitively and the files are taken in
        sorted order so the processing order is predictable.
        """
        directory = QFileDialog.getExistingDirectory(self, '选择文件夹')

        if directory:
            suffixes = {'.png', '.jpg', '.jpeg', '.bmp', '.tiff'}
            self.image_paths = sorted(
                child for child in Path(directory).iterdir()
                if child.is_file() and child.suffix.lower() in suffixes
            )

            if self.image_paths:
                self.load_first_image()
            else:
                QMessageBox.warning(self, '警告', '所选文件夹中没有图片!')

    def load_first_image(self):
        """Show a scaled preview and basic information for the first image."""
        if self.image_paths:
            try:
                pixmap = QPixmap(str(self.image_paths[0]))
                # QPixmap reports a load failure through a null pixmap
                # rather than an exception, so check for it explicitly.
                if pixmap.isNull():
                    raise ValueError(str(self.image_paths[0]))

                scaled_pixmap = pixmap.scaled(
                    self.image_label.size(),
                    Qt.KeepAspectRatio,
                    Qt.SmoothTransformation
                )
                self.image_label.setPixmap(scaled_pixmap)

                info = f"文件: {self.image_paths[0].name}\n"
                info += f"大小: {pixmap.width()} x {pixmap.height()}\n"
                info += f"格式: {self.image_paths[0].suffix}\n"
                info += f"总数: {len(self.image_paths)} 张图片"
                self.info_text.setText(info)

            except Exception as e:
                QMessageBox.warning(self, '错误', f'加载图片失败: {e}')

    def start_ocr(self):
        """Start a background OCR run over the selected images."""
        if not self.image_paths:
            QMessageBox.warning(self, '警告', '请先选择图片!')
            return

        # Lock the controls that must not change during a run.
        self.select_btn.setEnabled(False)
        self.select_dir_btn.setEnabled(False)
        self.start_btn.setEnabled(False)
        self.stop_btn.setEnabled(True)

        # Discard the output of any previous run.
        self.result_text.clear()
        self.json_text.clear()
        self.results = []

        # Map the displayed language names to Tesseract language codes.
        lang_map = {
            '英文': 'eng',
            '中文': 'chi_sim',
            '英文+中文': 'eng+chi_sim',
            '日语': 'jpn',
            '韩语': 'kor'
        }
        language = lang_map.get(self.lang_combo.currentText(), 'eng+chi_sim')
        engine = 'tesseract' if self.engine_combo.currentText() == 'Tesseract' else 'paddle'

        self.worker = OCRWorker(
            self.image_paths,
            language,
            engine,
            self.preprocess_check.isChecked(),
            self.deskew_check.isChecked()
        )

        self.worker.progress.connect(self.update_progress)
        self.worker.result_ready.connect(self.on_result_ready)
        self.worker.finished.connect(self.on_ocr_finished)
        self.worker.error.connect(self.on_ocr_error)

        self.progress_bar.setVisible(True)
        self.progress_bar.setValue(0)

        self.worker.start()

    def stop_ocr(self):
        """Cancel the running worker and wait for it to stop."""
        if self.worker:
            self.worker.cancel()
            self.worker.wait()
            self.on_ocr_finished()

    def update_progress(self, value):
        """Show the worker's progress percentage."""
        self.progress_bar.setValue(value)

    def on_result_ready(self, result):
        """Display one result from the worker and remember it for saving."""
        self.results.append(result)

        if 'error' in result:
            text = f"错误: {result['error']}"
        else:
            text = result.get('text', '')
            confidence = result.get('confidence', 0)
            # Tesseract confidences are on a 0-100 scale, while the label
            # uses percent formatting, which expects a 0-1 fraction.
            if result.get('engine') == 'tesseract':
                confidence = confidence / 100

            self.confidence_label.setText(f'置信度: {confidence:.2%}')
            self.language_label.setText(f'语言: {result.get("language", "")}')
            self.engine_label.setText(f'引擎: {result.get("engine", "").capitalize()}')

        self.result_text.append(f"=== {result.get('filename', 'Unknown')} ===\n")
        self.result_text.append(text)
        self.result_text.append("\n")

        # The JSON tab shows the most recent result only.
        try:
            rendered = json.dumps(result, ensure_ascii=False, indent=2)
        except (TypeError, ValueError) as e:
            # An exception escaping a slot can abort the application.
            rendered = f"无法序列化结果: {e}"
        self.json_text.setPlainText(rendered)

    def on_ocr_finished(self):
        """Restore the UI after a run; safe to call more than once per run.

        Reached from the worker's ``finished`` signal and directly from
        ``stop_ocr`` / ``on_ocr_error``. Only the first call for a given
        worker acts, so the completion dialog is shown once.
        """
        worker = self.worker
        if worker is None:
            return
        self.worker = None
        # Keep the thread object alive until the thread has really stopped;
        # destroying a QThread that is still running crashes the process.
        worker.wait()

        self.select_btn.setEnabled(True)
        self.select_dir_btn.setEnabled(True)
        self.start_btn.setEnabled(True)
        self.stop_btn.setEnabled(False)
        self.progress_bar.setVisible(False)

        QMessageBox.information(self, '完成', 'OCR识别完成!')

    def on_ocr_error(self, error_msg):
        """Report a run-level failure and restore the UI."""
        QMessageBox.critical(self, '错误', f'OCR识别出错:\n{error_msg}')
        self.on_ocr_finished()

    def save_results(self):
        """Save the results to a file chosen by the user.

        A ``.json`` file name gets every result of the last run as a JSON
        list; any other name gets the contents of the text tab.
        """
        if not self.result_text.toPlainText().strip():
            QMessageBox.warning(self, '警告', '没有可保存的结果!')
            return

        file_path, _ = QFileDialog.getSaveFileName(
            self, '保存结果',
            f'ocr_result_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt',
            '文本文件 (*.txt);;JSON文件 (*.json)'
        )

        if file_path:
            try:
                with open(file_path, 'w', encoding='utf-8') as f:
                    if file_path.lower().endswith('.json'):
                        json.dump(self.results, f, ensure_ascii=False, indent=2)
                    else:
                        f.write(self.result_text.toPlainText())

                QMessageBox.information(self, '成功', f'结果已保存到:\n{file_path}')

            except Exception as e:
                QMessageBox.critical(self, '错误', f'保存失败: {e}')

    def closeEvent(self, event):
        """Stop a running worker before the window is destroyed."""
        worker = self.worker
        if worker is not None:
            # Clearing the reference first makes a late finished signal a
            # no-op in on_ocr_finished.
            self.worker = None
            worker.cancel()
            worker.wait()
        super().closeEvent(event)

def main():
    """Create the Qt application, show the main window and run the event loop.

    Does not return: the process exits with the event loop's status.
    """
    application = QApplication(sys.argv)
    # Fusion looks the same on every platform.
    application.setStyle('Fusion')

    # Application metadata.
    application.setApplicationName('OCR图片识别工具')
    application.setOrganizationName('OCR Tool')

    window = OCREditor()
    window.show()

    exit_code = application.exec_()
    sys.exit(exit_code)


if __name__ == '__main__':
    main()

3. 使用说明

命令行使用

# 单张图片识别
python cli.py image.jpg

# 批量识别
python cli.py folder_path --batch

# 指定语言和引擎
python cli.py image.jpg -l chi_sim -e paddle

# 启用预处理
python cli.py image.jpg --preprocess --deskew

# 保存为JSON格式
python cli.py image.jpg -f json -o ./results

图形界面使用

# 启动GUI
python main.py
# 或直接运行
python gui.py

4. 功能特点

- 跨平台支持:兼容Windows和Linux系统
- 多引擎支持:支持Tesseract和PaddleOCR引擎
- 多语言识别:支持中英文等多种语言
- 图像预处理:提供图像增强、纠偏等功能
- 批量处理:支持批量处理多张图片
- 双界面:提供命令行和图形界面两种使用方式
- 结果导出:支持TXT和JSON格式导出

5. 扩展建议

- 添加API接口:可以添加Flask或FastAPI接口,提供Web服务
- 支持更多格式:添加对PDF、Word等文档的支持
- 添加翻译功能:集成翻译API,支持多语言翻译
- 添加数据库支持:保存识别历史记录
- 添加插件系统:支持扩展不同的OCR引擎

这个OCR工具完全跨平台,只需安装相应的依赖即可在Windows和Linux上运行。您可以根据需要调整配置和功能。

相关推荐