tesseract简单试用-526互联

目的：通过截图获取多语言文本，与多语言文档对比，确定文本是否正确

ocr.py

截图可以是1.单个文件；2.adb截图；3.目录下所有图片文件

import image_process
import tesseract_process
import book_process
import os

# Entry script: OCR one or more screenshots and record, in the multilingual
# workbook, every recognised string that matches a row of column A.
#
# Screenshot sources: 1 = a single image file, 2 = repeated adb screenshots,
# 3 = every image file in a directory.
option = int(input("1=file,2=adb,3=directory:"))
strings = []
# The published listing left the workbook/image/directory paths blank (which is
# a syntax error), so they are requested at run time instead.
book_file = input("book file:")
book = book_process.book(book_file)
if book.read() is None:
    # read() returns None (after printing its own message) when the workbook
    # cannot be used; save() would fail later, so stop before doing any OCR.
    raise SystemExit(1)

if option == 1:
    image_file = input("image file:")
    # The language code is the file name up to its first dot, e.g.
    # "C:\shots\eng.1.png" -> "eng". Both "\" and "/" separators are accepted.
    file_name = image_file.replace("\\", "/").rsplit("/", 1)[-1]
    language = file_name.split(".")[0]
    image = image_process.image_picker.get_image_by_path(image_file)
    strings = tesseract_process.text_recognition.get_text(image, language)
    book.record(strings, image_file)
elif option == 2:
    language = input("language:")
    while True:
        # One screenshot per iteration until the user answers "n".
        image, image_file = image_process.image_picker.get_image_by_adb(language)
        strings = tesseract_process.text_recognition.get_text(image, language)
        book.record(strings, image_file)
        flag = input("input n to stop or enter to continue")
        if flag == "n":
            break
elif option == 3:
    # Named image_dir rather than dir so the builtin dir() is not shadowed.
    image_dir = input("directory:")
    image_set = image_process.image_picker.get_image_from_dir(image_dir)
    for image_name in image_set:
        # File names are expected to start with the language code: "<lang>.<...>".
        language = image_name.split(".")[0]
        image_file = os.path.join(image_dir, image_name)
        print(image_file)
        image = image_process.image_picker.get_image_by_path(image_file)
        strings = tesseract_process.text_recognition.get_text(image, language)
        book.record(strings, image_file)
book.save()

image_process.py

处理获取图片的逻辑，为tesseract返回Pillow的image对象

from PIL import Image
import os
from datetime import datetime

class image_picker(object):
    """Obtain screenshots and return them as Pillow ``Image`` objects.

    Every helper is a static method, so callers use them directly on the
    class (``image_picker.get_image_by_path(...)``) or on an instance.
    """

    # File extensions (lower case, without the dot) treated as images.
    IMAGE_TYPES = ("png", "jpg", "jpeg")

    @staticmethod
    def get_image_by_path(file):
        """Open the image stored at *file* and return the Pillow image."""
        return Image.open(file)

    @staticmethod
    def get_image_by_adb(language):
        """Take a screenshot of the attached Android device through adb.

        The screenshot is named ``<language>.<unix timestamp>.png``, pulled
        into the current directory and then removed from the device.

        Returns:
            A ``(image, image_file)`` tuple: the opened Pillow image and the
            file name (relative to the current directory).

        Raises:
            subprocess.CalledProcessError: if taking or pulling the
                screenshot fails.
        """
        # Local import: this is the only place that needs subprocess.
        import subprocess

        timestamp = str(int(datetime.now().timestamp()))
        image_file = language + "." + timestamp + ".png"
        remote_file = "/sdcard/" + image_file
        # Argument lists keep the local shell out of the picture, so an odd
        # language value cannot be interpreted as a local shell command.
        subprocess.run(["adb", "shell", "screencap", "-p", remote_file], check=True)
        subprocess.run(["adb", "pull", remote_file, "./"], check=True)
        # Removing the device-side copy is best effort; the image is already local.
        subprocess.run(["adb", "shell", "rm", remote_file], check=False)
        image = Image.open("./" + image_file)
        return image, image_file

    @staticmethod
    def get_image_from_dir(dir):
        """Return the set of image file names found directly inside *dir*.

        Only the top level of *dir* is listed: callers join the returned
        names onto *dir*, which would not locate files in subdirectories.
        Extensions are matched case-insensitively. A missing directory
        yields an empty set.
        """
        image_set = set()
        for _root, _subdirs, files in os.walk(dir):
            for file in files:
                extension = os.path.splitext(file)[1][1:].lower()
                if extension in image_picker.IMAGE_TYPES:
                    image_set.add(file)
            # The first os.walk() entry is *dir* itself; do not descend further.
            break
        return image_set

tesseract_process.py

使用tesseract获取图片上的文本，以换行符或连续两个及以上的空白字符作为分隔符，返回文本片段的列表

import pytesseract
import re
import image_process

class text_recognition(object):
    """Run tesseract on an image and split its output into text fragments."""

    @staticmethod
    def get_text(image,lang):
        """Recognise the text on *image* and return it as a list of fragments.

        Args:
            image: the image passed to ``pytesseract.image_to_string``.
            lang: tesseract language code used for recognition.

        Returns:
            A list of non-empty strings. The recognised text is split on
            newlines and on runs of two or more whitespace characters; each
            fragment is stripped of surrounding whitespace.
        """
        # preserve_interword_spaces=1 is passed so that wide gaps stay in the
        # output as runs of spaces, which the pattern below treats as separators.
        text=pytesseract.image_to_string(image,lang=lang,config="--psm 3 -c preserve_interword_spaces=1")
        fragments=(piece.strip() for piece in re.split(r"\n|\s{2,}",text))
        # Adjacent separators and separators at either end of the text leave
        # empty strings behind; they carry no text to compare, so drop them.
        return [piece for piece in fragments if piece]

book_process.py

多语言文档储存在xlsx文件，A列为给定的文本，B列为对比结果，C列为发现文本的次数，D列为发现文本的图片文件

import openpyxl

class book(object):
    """Access to the multilingual workbook (an xlsx file).

    Column layout of the sheet: A = expected text, B = comparison result,
    C = number of times the text was found, D = image files it was found in.
    """

    def __init__(self,file,sheet_name="Sheet1"):
        """Remember the workbook path and the sheet to work on.

        Args:
            file: path of the xlsx file; it is also the path save() writes to.
            sheet_name: name of the worksheet holding the texts.
        """
        self.__file=file
        self.__sheet_name=sheet_name
        self.__book=None
        self.__sheet=None

    def read(self):
        """Load the workbook and check that it holds no earlier results.

        Returns:
            The loaded workbook, or None (after printing "init error") when
            any cell of column C already contains a positive count.
        """
        self.__book=openpyxl.load_workbook(self.__file)
        self.__sheet=self.__book[self.__sheet_name]
        rows=self.__sheet.iter_rows(min_col=3,max_col=3,values_only=True)
        # Empty cells come back as None and other cells may hold text; only
        # real numbers are compared so neither raises a TypeError.
        if any(isinstance(count,(int,float)) and count>0 for row in rows for count in row):
            print("init error")
            self.__book=None
            return None
        return self.__book

    def write(self,row,column,value):
        """Store *value* in the cell at (*row*, *column*), both 1-based."""
        self.__sheet.cell(row,column,value)

    def record(self,strings,image_file):
        """Mark every string of *strings* that matches a text in column A.

        For each match, column B is set to "found", the counter in column C
        is incremented and *image_file* is appended to column D.

        Args:
            strings: recognised text fragments to look up.
            image_file: name of the image the fragments came from.
        """
        # Map each text in column A to its row number; when a text occurs in
        # several rows the last row wins.
        words={}
        for rowidx,row in enumerate(self.__sheet.iter_rows(max_col=1,values_only=True),start=1):
            for word in row:
                words[word]=rowidx
        for word in strings:
            if word not in words:
                continue
            rowidx=words[word]
            print("found %s at %d" % (word[:15],rowidx))
            self.__sheet.cell(rowidx,2,"found")
            # An untouched counter cell is empty (None); count it as zero.
            previous_count=self.__sheet.cell(rowidx,3).value or 0
            self.__sheet.cell(rowidx,3,previous_count+1)
            # Start the file list with the first file instead of the text "None".
            previous_files=self.__sheet.cell(rowidx,4).value
            if previous_files is None:
                files=image_file
            else:
                files=str(previous_files)+"\r"+image_file
            self.__sheet.cell(rowidx,4,files)

    def save(self):
        """Write the workbook back to the file it was loaded from."""
        self.__book.save(self.__file)

tesseract

tesseract-ocr

tesseract-ocr tesseract ocr

tesseract accuracy improve how

tesseract-wasm tesseract fastify wasm

tesseract tesseract-wasm webassembly wasm

tesseract webassembly ocr js