Wednesday, October 8, 2025

Use Case#1 - Form Filler

Let’s now make this a mini production-ready system with:

📦 SQLite database
🧾 List of all past submissions
🔗 Downloadable PDF for each record
🧠 OCR + Ollama data extraction workflow preserved


🧩 Final Architecture Overview

form_filler/
│
├── app.py
├── templates/
│   ├── upload.html
│   ├── form.html
│   └── submissions.html
└── static/
    ├── uploads/
    └── pdfs/

⚙️ Step 1: Install Dependencies

pip install flask pytesseract pillow pdfplumber reportlab

SQLite is built-in with Python.

Make sure:

ollama serve
ollama pull llama3.2:latest

🧠 Step 2: Complete app.py

from flask import Flask, render_template, request, send_from_directory, redirect, url_for
import pytesseract
from PIL import Image
import subprocess, json, os, sqlite3
import pdfplumber
from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from datetime import datetime

# SQLite database file. The path is relative, so it is resolved against the
# directory the app is started from.
# The Flask application object and the upload/PDF folders are created in the
# configuration section that follows.
DB_FILE = 'submissions.db'
import os
from flask import Flask, render_template, request, send_from_directory

app = Flask(__name__)

# Anchor the storage folders to the directory containing this file so they do
# not depend on the working directory the server is launched from.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
UPLOAD_FOLDER = os.path.join(BASE_DIR, 'static', 'uploads')
PDF_FOLDER = os.path.join(BASE_DIR, 'static', 'pdfs')

# Create both folders up front; exist_ok makes restarts harmless.
for _required_dir in (UPLOAD_FOLDER, PDF_FOLDER):
    os.makedirs(_required_dir, exist_ok=True)

# Expose the folders through the Flask config for the route handlers.
app.config['PDF_FOLDER'] = PDF_FOLDER
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# ------------------------------
# DATABASE SETUP
# ------------------------------
def init_db():
    """Create the ``submissions`` table in ``DB_FILE`` if it does not exist.

    The statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this
    function repeatedly is harmless. The connection is always closed, even
    when the statement fails.
    """
    conn = sqlite3.connect(DB_FILE)
    try:
        conn.execute("""
        CREATE TABLE IF NOT EXISTS submissions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT,
            date_of_birth TEXT,
            address TEXT,
            college_name TEXT,
            course_applied TEXT,
            marks TEXT,
            pdf_file TEXT,
            created_at TEXT
        )
        """)
        conn.commit()
    finally:
        # Release the file handle even if the DDL raised.
        conn.close()

def save_to_db(data, pdf_file):
    """Insert one submitted form into the ``submissions`` table.

    Args:
        data: Mapping of form field names to values. Missing fields are
            stored as empty strings.
        pdf_file: File name of the generated PDF belonging to this record.

    The row is stamped with the current local time. The connection is always
    closed, even when the insert fails.
    """
    form_fields = (
        'name',
        'date_of_birth',
        'address',
        'college_name',
        'course_applied',
        'marks',
    )
    values = [data.get(field, '') for field in form_fields]
    values.append(pdf_file)
    values.append(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    conn = sqlite3.connect(DB_FILE)
    try:
        conn.execute("""
            INSERT INTO submissions (name, date_of_birth, address, college_name, course_applied, marks, pdf_file, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """, values)
        conn.commit()
    finally:
        # Release the file handle even if the insert raised.
        conn.close()

def fetch_all_submissions():
    """Return every stored submission, newest first.

    Returns:
        A list of ``(id, name, college_name, course_applied, created_at,
        pdf_file)`` tuples ordered by descending ``id``.

    The connection is always closed, even when the query fails.
    """
    conn = sqlite3.connect(DB_FILE)
    try:
        cur = conn.execute(
            "SELECT id, name, college_name, course_applied, created_at, pdf_file FROM submissions ORDER BY id DESC"
        )
        return cur.fetchall()
    finally:
        conn.close()

# ------------------------------
# OCR + OLLAMA UTILITIES
# ------------------------------
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page text with leading/trailing whitespace removed.
        Pages without extractable text contribute an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may yield None for pages with no text layer (e.g.
        # scanned pages); fall back to "" so concatenation cannot fail.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts).strip()

def extract_text(file_path):
    """Extract raw text from an uploaded image (via OCR) or PDF.

    Args:
        file_path: Path of the uploaded file. The type is decided by the
            file extension (case-insensitive).

    Returns:
        The extracted text, or an empty string for unsupported extensions.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == '.pdf':
        return extract_text_from_pdf(file_path)
    if ext not in ('.jpg', '.jpeg', '.png'):
        return ""

    # Default Windows install location. Only override pytesseract's command
    # when that binary really exists, so Linux/macOS keep using the
    # ``tesseract`` executable found on PATH.
    windows_tesseract = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
    if os.path.isfile(windows_tesseract):
        pytesseract.pytesseract.tesseract_cmd = windows_tesseract

    # OCR the file that was actually uploaded and release the handle after.
    with Image.open(file_path) as img:
        return pytesseract.image_to_string(img)

def generate_pdf(data, output_path):
    """Render the submitted form fields into a simple A4 summary PDF.

    Args:
        data: Mapping of field names to values. A ``submit`` key, if present,
            is skipped.
        output_path: Destination path of the PDF file.

    Field names are shown in title case with underscores replaced by spaces.
    """
    # Local import: only this function needs it.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(output_path, pagesize=A4)
    styles = getSampleStyleSheet()
    elements = [
        Paragraph("<b>Admission Form Summary</b>", styles['Title']),
        Spacer(1, 20),
    ]

    for key, value in data.items():
        if key == "submit":
            continue
        # Paragraph text is parsed as XML-like markup, so user-supplied
        # characters such as "&" and "<" must be escaped before insertion.
        label = escape(key.replace('_', ' ').title())
        # Keep line breaks typed into multi-line fields (e.g. the address).
        body = "<br/>".join(escape(str(value)).splitlines())
        elements.append(Paragraph(f"<b>{label}:</b> {body}", styles['Normal']))
        elements.append(Spacer(1, 10))

    doc.build(elements)

# ------------------------------
# ROUTES
# ------------------------------
@app.route('/')
def upload_page():
    """Serve the landing page containing the document upload form."""
    landing_page = render_template('upload.html')
    return landing_page

def _extract_json_object(response):
    """Parse the outermost ``{...}`` span of *response*; return ``{}`` if none.

    Raises ``json.JSONDecodeError`` when a span exists but is not valid JSON.
    """
    start = response.find('{')
    end = response.rfind('}')
    if start == -1 or end < start:
        return {}
    return json.loads(response[start:end + 1])


@app.route('/process', methods=['POST'])
def process_image():
    """Handle an upload: store the file, extract its text, ask the model for fields.

    Returns:
        The rendered ``form.html`` pre-filled with the extracted fields, or a
        plain-text message when no usable file or text is available. When the
        model call fails, ``data`` carries a single ``error`` entry instead.
    """
    if 'file' not in request.files:
        return "No file uploaded.", 400

    file = request.files['file']
    # Keep only the final path component of the client-supplied name so the
    # upload cannot be written outside UPLOAD_FOLDER (e.g. "../../app.py").
    filename = os.path.basename((file.filename or '').replace('\\', '/'))
    if filename in ('', '.', '..'):
        return "No file selected.", 400

    filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(filepath)

    text = extract_text(filepath)
    if not text.strip():
        return "No text could be extracted. Try a clearer image or text-based PDF."

    prompt = f"""
    You are a data extraction assistant.
    Extract and return as JSON the following fields if present:
    - name
    - date_of_birth
    - address
    - college_name
    - course_applied
    - marks
    Text:
    {text}
    """

    try:
        result = subprocess.run(
            ["ollama", "run", "llama3.2:latest"],
            input=prompt.encode("utf-8"),
            capture_output=True,
            timeout=90
        )
        if result.returncode != 0:
            # Surface the model's own error text instead of an empty form.
            stderr = result.stderr.decode("utf-8", errors="replace").strip()
            data = {"error": f"Model error: {stderr or 'ollama exited with an error'}"}
        else:
            response = result.stdout.decode("utf-8", errors="replace").strip()
            data = _extract_json_object(response)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: ollama not installed; SubprocessError: timeout;
        # ValueError: the model's reply was not valid JSON.
        data = {"error": f"Model error: {str(e)}"}

    return render_template('form.html', data=data, image=filename)

@app.route('/submit', methods=['POST'])
def submit_form():
    """Save the reviewed form: generate its PDF, store the record, show the list.

    The PDF file name is built from a sanitised copy of the applicant's name
    plus a timestamp, so it cannot escape ``PDF_FOLDER`` and two applicants
    with the same name do not overwrite each other's file.
    """
    data = request.form.to_dict()

    raw_name = data.get('name', '').strip() or 'anonymous'
    # Allow only letters, digits, "-" and "_" in the file name; everything
    # else (spaces, path separators, dots) becomes "_".
    safe_name = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in raw_name
    )
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    pdf_filename = f"filled_form_{safe_name}_{stamp}.pdf"

    pdf_path = os.path.join(app.config['PDF_FOLDER'], pdf_filename)
    generate_pdf(data, pdf_path)
    save_to_db(data, pdf_filename)
    return redirect(url_for('view_submissions'))

@app.route('/submissions')
def view_submissions():
    """Render the table of all stored submissions."""
    return render_template('submissions.html', submissions=fetch_all_submissions())

@app.route('/download/<filename>')
def download_pdf(filename):
    """Send the requested file from ``PDF_FOLDER`` as an attachment."""
    pdf_dir = app.config['PDF_FOLDER']
    return send_from_directory(pdf_dir, filename, as_attachment=True)

if __name__ == "__main__":
    # Make sure the submissions table exists before serving requests.
    init_db()
    # Debug mode enables the interactive debugger, which lets anyone who can
    # reach the server run code on it. Keep it off unless explicitly requested
    # with FLASK_DEBUG=1 during local development.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1")

🧾 templates/upload.html

<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <title>Upload Document</title>
  <style>
    body { font-family: Arial; background: #eef; padding: 40px; }
    .container { background: #fff; padding: 30px; border-radius: 10px; width: 400px; margin: auto; box-shadow: 0 0 10px #aaa; }
  </style>
</head>
<body>
  <div class="container">
    <h2>Upload Document (Image or PDF)</h2>
    <!-- multipart/form-data is required for the file to reach request.files -->
    <form action="{{ url_for('process_image') }}" method="POST" enctype="multipart/form-data">
      <input type="file" name="file" accept=".jpg,.jpeg,.png,.pdf" required><br><br>
      <button type="submit">Process</button>
    </form>
    <hr>
    <a href="{{ url_for('view_submissions') }}">📜 View All Submissions</a>
  </div>
</body>
</html>

🧮 templates/form.html

<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <title>Form Preview</title>
  <style>
    body { font-family: Arial; background: #eef; padding: 40px; }
    .container { background: #fff; padding: 30px; border-radius: 10px; width: 600px; margin: auto; box-shadow: 0 0 10px #aaa; }
    input, textarea { width: 100%; padding: 8px; margin: 8px 0; }
    .error { color: #b00; }
  </style>
</head>
<body>
  <div class="container">
    <h2>Auto-Filled Admission Form</h2>
    <!-- Shown when the extraction step reported a problem; the fields can still be filled in by hand -->
    {% if data.get('error') %}
    <p class="error">{{ data.get('error') }}</p>
    {% endif %}
    <form method="POST" action="{{ url_for('submit_form') }}">
      <label>Name:</label>
      <input type="text" name="name" value="{{ data.get('name', '') }}">
      <label>Date of Birth:</label>
      <input type="text" name="date_of_birth" value="{{ data.get('date_of_birth', '') }}">
      <label>Address:</label>
      <textarea name="address">{{ data.get('address', '') }}</textarea>
      <label>College Name:</label>
      <input type="text" name="college_name" value="{{ data.get('college_name', '') }}">
      <label>Course Applied:</label>
      <input type="text" name="course_applied" value="{{ data.get('course_applied', '') }}">
      <label>Marks:</label>
      <input type="text" name="marks" value="{{ data.get('marks', '') }}">
      <button type="submit">Submit &amp; Save</button>
    </form>
  </div>
</body>
</html>

🗂️ templates/submissions.html

<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <title>All Submissions</title>
  <style>
    body { font-family: Arial; background: #eef; padding: 40px; }
    table { border-collapse: collapse; width: 90%; margin: auto; background: #fff; }
    th, td { border: 1px solid #ccc; padding: 10px; text-align: left; }
    th { background: #ddd; }
    a { text-decoration: none; color: blue; }
  </style>
</head>
<body>
  <h2 style="text-align:center;">📜 All Submitted Admission Forms</h2>
  <table>
    <tr>
      <th>ID</th>
      <th>Name</th>
      <th>College</th>
      <th>Course</th>
      <th>Date</th>
      <th>PDF</th>
    </tr>
    <!-- Each row is (id, name, college_name, course_applied, created_at, pdf_file) -->
    {% for s in submissions %}
    <tr>
      <td>{{ s[0] }}</td>
      <td>{{ s[1] }}</td>
      <td>{{ s[2] }}</td>
      <td>{{ s[3] }}</td>
      <td>{{ s[4] }}</td>
      <td><a href="{{ url_for('download_pdf', filename=s[5]) }}">Download</a></td>
    </tr>
    {% endfor %}
  </table>
  <div style="text-align:center; margin-top:20px;">
    <a href="{{ url_for('upload_page') }}">⬅️ Back to Upload</a>
  </div>
</body>
</html>

🚀 Run & Test

python app.py

Open browser → http://localhost:5000

✅ Features

  • Upload scanned image or PDF

  • Extracts data with OCR + Ollama

  • Auto-fills editable web form

  • Generates PDF of final data

  • Saves record to SQLite DB

  • View all records at /submissions with download links


Would you like me to add Admin login protection for the /submissions page next (so only authorized users can view/download records)?            


How to create and test with real sample image/PDF files for your OCR + Ollama web app.

Since I can’t attach files directly here, I’ll give you ready-to-generate content you can copy-paste and use locally.


🖼️ SAMPLE IMAGE (Admission Form – Image)

1️⃣ Open any text editor (e.g., Notepad).
2️⃣ Copy this content and save it as sample_form.txt:

Name: Anitha R
Date of Birth: 23-05-2003
Address: 12, Gandhi Street, Madurai
College Name: PSG College of Arts and Science
Course Applied: B.Sc Computer Science
Marks: 472/500

3️⃣ Now convert this text file into an image:

Option A: Use Python (quick)

from PIL import Image, ImageDraw, ImageFont

# One-off helper script: renders the sample admission form as a JPEG that can
# be uploaded to the app to exercise the OCR path.
form_lines = [
    "Name: Anitha R",
    "Date of Birth: 23-05-2003",
    "Address: 12, Gandhi Street, Madurai",
    "College Name: PSG College of Arts and Science",
    "Course Applied: B.Sc Computer Science",
    "Marks: 472/500",
]
text = "\n".join(form_lines)

# White 800x400 canvas with the text drawn in black near the top-left corner.
img = Image.new("RGB", (800, 400), color="white")
ImageDraw.Draw(img).text((30, 30), text, fill="black")

img.save("sample_form.jpg")
print("Saved sample_form.jpg")

Run this once; it will create sample_form.jpg — upload it in your Flask app’s upload page.


📄 SAMPLE PDF (Admission Form – PDF)

Option A: Generate using Python

from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

# One-off helper script: writes the sample admission form as a text-based PDF
# that can be uploaded to the app to exercise the PDF path.
text = """Name: Arjun M
Date of Birth: 15-11-2002
Address: 45, Rajaji Salai, Chennai
College Name: Loyola College
Course Applied: M.Sc Data Science
Marks: 489/500"""

c = canvas.Canvas("sample_form.pdf", pagesize=A4)
c.setFont("Helvetica", 12)

# Start at (80, 750) and move down 20 points for each following line.
for row, line in enumerate(text.split("\n")):
    c.drawString(80, 750 - 20 * row, line)

c.save()
print("Saved sample_form.pdf")

After running this, you’ll have sample_form.pdf in your working directory.

Upload this in your app — the OCR + LLM pipeline should extract data automatically.


✅ QUICK TEST PLAN

| Step | Input | Expected Output |
|------|-------|-----------------|
| 1 | Upload sample_form.jpg | Text auto-extracted (“Anitha R”, “PSG College …”) |
| 2 | Upload sample_form.pdf | Text auto-extracted (“Arjun M”, “Loyola College”) |
| 3 | Edit fields if needed | Submit |
| 4 | Go to /submissions | Entries listed with download links |
| 5 | Download PDF | Check that it matches entered data |

------------------------

Sometimes an error occurs when running OCR, usually because the Tesseract path is not specified. That error means your Python code is correct, but the Tesseract OCR engine itself isn’t installed or isn’t found in your system PATH.

Let’s fix it step-by-step depending on your OS 👇


🪟 If you’re on Windows

1️⃣ Install Tesseract OCR

Download the Windows installer from the official source:
👉 https://github.com/UB-Mannheim/tesseract/wiki

Pick the latest version (for example, tesseract-ocr-w64-setup-5.x.x.exe).
Run the installer and let it install to the default path, e.g.:

C:\Program Files\Tesseract-OCR\

2️⃣ Add to System PATH

  • Open System Properties → Advanced → Environment Variables

  • Under System variables, find Path, click Edit

  • Add this entry:

    C:\Program Files\Tesseract-OCR\
    
  • Click OK everywhere.

3️⃣ Verify installation

Open a new Command Prompt and run:

tesseract --version

If it prints version info (like tesseract v5.3.3.0), it’s good.

4️⃣ Update your Python code (optional safety net)

Add this line before calling pytesseract.image_to_string:

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

Full example:

import pytesseract
from PIL import Image

# Tell pytesseract exactly where the Tesseract binary lives on Windows, so it
# works even when the install folder is not on PATH.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# OCR the sample form and print whatever text was recognised.
form_image = Image.open("sample_form.jpg")
print(pytesseract.image_to_string(form_image))

🐧 If you’re on Linux

Install via apt:

sudo apt update
sudo apt install tesseract-ocr -y

Verify:

tesseract --version

Then you’re done — no path setting needed.


🍎 If you’re on macOS

Using Homebrew:

brew install tesseract

Verify:

tesseract --version

Then run your Flask app again — it’ll work immediately.


✅ After installation

Restart your Flask app:

python app.py

Upload the sample image (sample_form.jpg) — you should now see extracted text on screen, no Tesseract errors 🎯


Successful Output : Check in http://localhost:5000/submissions as shown below:


Use Case 1 Successfully Implemented!!!
 


Use Case#1 - Form Filler

Let’s now make this a mini production-ready system with: 📦 SQLite database 🧾 List of all past submissions 🔗 Downloadable PDF for eac...