python3.7+
pip install pytesseract==0.1.9
安装tesseract-ocr(配置path环境变量或者在代码中指定tesseract_cmd)
import pytesseract

# If the tesseract executable is not on PATH, set its full path explicitly.
# NOTE(review): the value in the original snippet was truncated to r'D:';
# the path below assumes the D:\tesseract install directory used by the
# TESSDATA_PREFIX setting in this article - confirm against your machine.
pytesseract.pytesseract.tesseract_cmd = r'D:\tesseract\tesseract.exe'
# Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
import os

import pytesseract

# Directory holding the language files (*.traineddata). Either configure the
# tesseract environment variables system-wide or set the directory here.
# A raw string is required: in a normal literal the "\t" sequences would be
# interpreted as TAB characters and corrupt the path.
os.environ['TESSDATA_PREFIX'] = r'D:\tesseract\tessdata'

# If the tesseract executable is not on PATH, set its full path explicitly.
# NOTE(review): the original value was cut off at r'D:\tesseract\' (which is
# also an unterminated raw string); "tesseract.exe" is assumed - confirm.
pytesseract.pytesseract.tesseract_cmd = r'D:\tesseract\tesseract.exe'
直接在命令行执行 tesseract 正常,但调用 pytesseract.image_to_osd(Image.open('test.png')) 报错。命令行示例:
tesseract E:\mat\py-demo-22\extreme_points\images\normal.jpg E:\mat\py-demo-22\extreme_points\images\normal_1 -l osd --psm 0
报错:tesseract升级的bug
解决:直接传递图片文件路径(而不是 PIL Image 对象),或者切回低版本的 tesseract(5.0.0-alpha-20201224 测试正常)。详情可参考(原文此处的链接已缺失)
# Get orientation and script detection (OSD) information.
# The PIL-image variant is left disabled. Note carried over from the original:
# the failure happens on tesseract versions released after Jan 1, 2021;
# tesseract 5.0.0-alpha-20201224 was tested.
# print(pytesseract.image_to_osd(Image.open('test.png')))
osd_report = pytesseract.image_to_osd('test.png')  # pass the file path instead
print(osd_report)
# Exercise the main pytesseract entry points.
# test_osd.py
import os

import pytesseract
from PIL import Image

# Directory holding the language files (*.traineddata). Either configure the
# tesseract environment variables system-wide or set the directory here.
# A raw string is required: in a normal literal the "\t" sequences would be
# interpreted as TAB characters and corrupt the path.
os.environ['TESSDATA_PREFIX'] = r'D:\tesseract\tessdata'

# If the tesseract executable is not on PATH, set its full path explicitly.
# NOTE(review): the original value was cut off at r'D:\tesseract\' (which is
# also an unterminated raw string); "tesseract.exe" is assumed - confirm.
pytesseract.pytesseract.tesseract_cmd = r'D:\tesseract\tesseract.exe'

# Image to string, passing a PIL image.
print("PIL image_to_string: ", pytesseract.image_to_string(Image.open('test.png')))

# Image to string, passing a relative or absolute path to the image file.
print('image_to_string: ', pytesseract.image_to_string('test.png'))

# Print every language supported by the installed tesseract.
# (The original call was truncated to "_languages"; the pytesseract function
# is get_languages.)
print('langs: ', pytesseract.get_languages(config=''))

# Recognize French text.
print('fra image_to_string: ', pytesseract.image_to_string(Image.open('test-european.jpg'), lang='fra'))

# Batch processing: a single text file that lists several image paths.
# NOTE(review): the file name was lost in the original (the literal was cut to
# "'"); 'images.txt' follows the pytesseract README example - confirm.
print('batch Images image_to_string: ', pytesseract.image_to_string('images.txt'))

# Stop tesseract once the timeout expires.
try:
    print(pytesseract.image_to_string('test.jpg', timeout=2))  # Timeout after 2 seconds
    print(pytesseract.image_to_string('test.jpg', timeout=0.5))  # Timeout after half a second
except RuntimeError as timeout_error:
    # Tesseract processing is terminated; report it instead of hiding it.
    print('image_to_string timed out: ', timeout_error)

# Get the estimated bounding boxes.
# print(pytesseract.image_to_boxes(Image.open('test.png')))
print('image_to_boxes: ', pytesseract.image_to_boxes('test.png'))

# Get verbose data, including boxes, confidences, line numbers and page numbers.
print('image_to_data: ', pytesseract.image_to_data(Image.open('test.png')))

# Get orientation and script detection (OSD) information.
# The PIL-image variant is left disabled. Note carried over from the original:
# the issue happens on tesseract versions released after Jan 1, 2021;
# tesseract 5.0.0-alpha-20201224 was tested.
# print(pytesseract.image_to_osd(Image.open('test.png')))
print('image_to_osd: ', pytesseract.image_to_osd('test.png'))

# Get a searchable PDF (left disabled: the original notes that this call
# raised an error).
# pdf = pytesseract.image_to_pdf_or_hocr('test.png', extension='pdf')
# with open('test.pdf', 'w+b') as f:
#     f.write(pdf)  # pdf type is bytes by default

# Get HOCR output (left disabled: the original notes an error with
# tesseract v5.2.0.20220712).
# hocr = pytesseract.image_to_pdf_or_hocr('test.png', extension='hocr')

# Get ALTO XML output.
xml = pytesseract.image_to_alto_xml('test.png')
print('image_to_alto_xml: ', xml)
本文发布于:2024-02-03 00:19:06,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170689074647414.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |