36 lines
1.1 KiB
Python
36 lines
1.1 KiB
Python
"""Read printed text out of an image — what people usually call OCR.
|
|
|
|
Traditional OCR (Tesseract, ABBYY, etc.) is a dedicated system trained on
|
|
piles of text-image pairs and uses character-level segmentation. We do
|
|
the same job here by asking a general-purpose vision-language model to
|
|
just *read out loud* what it sees.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
from ai_function import ask
|
|
|
|
# Image used when no --image argument is supplied on the command line:
# "sample_images/text_notice.jpg" under the parent of this file's directory.
DEFAULT_IMAGE = Path(__file__).resolve().parent.parent.joinpath(
    "sample_images", "text_notice.jpg"
)
|
|
|
|
|
|
def ai_ocr(image_path: str | Path) -> str:
    """Transcribe the text visible in an image by delegating to ``ask``.

    A fixed transcription prompt is sent to ``ask`` together with the image,
    and whatever ``ask`` returns is handed back to the caller unchanged.

    Args:
        image_path: Location of the image to read, as a string or ``Path``.
            It is forwarded untouched as the ``image`` keyword argument.

    Returns:
        The value produced by ``ask`` (annotated as ``str``; presumably the
        transcribed text — confirm against ``ai_function.ask``).
    """
    # The prompt asks for a verbatim transcription with nothing added.
    prompt = (
        "Read all the text visible in this image, exactly as written, preserving "
        "line breaks. Output only the text — no commentary, no quotes, no "
        "summary, no description of the image."
    )
    return ask(prompt, image=image_path)
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: transcribe one image and print the result.
    cli = argparse.ArgumentParser()
    cli.add_argument("--image", type=Path, default=DEFAULT_IMAGE)
    chosen_image = cli.parse_args().image

    # Announce which file is being read, then print the transcription
    # under a header line.
    print(f"Image: {chosen_image}\n")
    print("--- transcribed text ---")
    transcription = ai_ocr(chosen_image)
    print(transcription)
|