Chat with images having text

Amith Koujalgi · 3 min read

This article walks you through building a simple chat app for asking questions about images that contain text. It uses Ollama to interact with language models.

With this user interface you can:

  • set the Ollama host you want to use
  • select from the models available on Ollama (see the note after this list)
  • upload an image containing text
  • ask the selected model questions about the image
  • get streaming answers from the model
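
The model dropdown can only show models that have already been pulled into Ollama. If you haven't pulled one yet, do that first (llama2 below is just an example model name; use whichever model you prefer):

ollama pull llama2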

Install dependencies

pip install streamlit==1.31.0 ollama==0.1.6 easyocr
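
Before wiring EasyOCR into the app, it helps to see what it returns. Here is a minimal standalone sketch (sample.png is a placeholder path; point it at any image containing text):

import easyocr

# readtext() returns a list of (bounding_box, text, confidence) tuples,
# where bounding_box holds the four [x, y] corner points of the detected text.
reader = easyocr.Reader(['en'])
for box, text, score in reader.readtext('sample.png'):
    print(f"{score:.2f}  {text}")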

Streamlit app code

# pip install streamlit==1.31.0 ollama==0.1.6 easyocr
from typing import Generator

import easyocr
import ollama
import streamlit as st
from ollama import Client

title = 'Chat with images having text'

st.set_page_config(layout="wide", page_title=title)

st.title(title)


def extract_with_ocr(image_file):
    """Run EasyOCR on the image and return a list of recognized text blocks."""

    def _convert_to_xywh(coords):
        # EasyOCR returns four [x, y] corner points per detection; reduce them
        # to an axis-aligned box: top-left corner plus width and height.
        x_coords = [coord[0] for coord in coords]
        y_coords = [coord[1] for coord in coords]

        x_min = min(x_coords)
        y_min = min(y_coords)
        x_max = max(x_coords)
        y_max = max(y_coords)

        return {
            "x": float(x_min),
            "y": float(y_min),
            "w": float(x_max - x_min),
            "h": float(y_max - y_min),
        }

    data = []
    reader = easyocr.Reader(['en'])
    result = reader.readtext(image_file)
    for coordinates, text, score in result:
        data.append({
            "text": text,
            "coordinates": _convert_to_xywh(coordinates),
            "confidence-score": score
        })
    return data


def ask_ollama(ollama_url: str, model_name: str, ocr_data: list, messages: list) -> Generator:
    # Gather all recognized text blocks into a single context string.
    ocr_text = ""
    for data in ocr_data:
        ocr_text = ocr_text + "\n" + data['text']

    # Rewrite the latest user message so it carries the OCR context
    # together with the user's question.
    question = messages[-1]['content']
    _prompt = f"""
Use this data and answer the user's questions appropriately and very precisely. Keep the answer to the point.
If you don't know the answer, promptly tell that you do not know the answer.

{ocr_text}

{question}
"""
    messages[-1]['content'] = _prompt

    # Stream the model's answer chunk by chunk.
    stream = Client(host=ollama_url).chat(
        model=model_name,
        messages=messages,
        stream=True
    )
    for chunk in stream:
        yield chunk['message']['content']


if "ollama_host" not in st.session_state:
    st.session_state.ollama_host = ""
if "selected_model" not in st.session_state:
    st.session_state.selected_model = ""
if "uploaded_image" not in st.session_state:
    st.session_state.uploaded_image = None
if "ocr_data" not in st.session_state:
    st.session_state.ocr_data = []
if "messages" not in st.session_state:
    st.session_state.messages = []

with st.sidebar:
    st.session_state.ollama_host = st.text_input("Ollama Host:", "http://localhost:11434")
    st.session_state.selected_model = st.selectbox(
        "Select the model:", [model["name"] for model in ollama.list()["models"]])
    st.session_state.uploaded_image = st.file_uploader(
        "Choose a file", accept_multiple_files=False,
        type=["png", "jpg", "jpeg"]
    )

if st.session_state.uploaded_image is not None:
    st.session_state.messages = []
    img_bytes_data = st.session_state.uploaded_image.getvalue()
    st.session_state.ocr_data = extract_with_ocr(img_bytes_data)
    st.image(img_bytes_data)
    st.session_state.messages.append(
        {
            "role": "assistant",
            "content": f"Uploaded file: `{st.session_state.uploaded_image.name}`. Now what would you like to know from this image?"
        }
    )

# Render the chat history on every rerun.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

if prompt := st.chat_input("How can I help you?"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)
    with st.chat_message("assistant"):
        response = st.write_stream(
            ask_ollama(
                st.session_state.ollama_host,
                st.session_state.selected_model,
                st.session_state.ocr_data,
                st.session_state.messages
            )
        )
    st.session_state.messages.append(
        {"role": "assistant", "content": response}
    )

Start the app

streamlit run <your-file-name>.py
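
Replace <your-file-name> with the name of the file you saved the code to. If the default port 8501 is already taken, Streamlit lets you pick another one (8502 here is just an example):

streamlit run <your-file-name>.py --server.port 8502

Then open the local URL Streamlit prints, set the Ollama host in the sidebar, pick a model, upload an image, and start asking questions about it.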