Let’s now make this a mini production-ready system with:
📦 SQLite database
🧾 List of all past submissions
📄 Downloadable PDF for each record
🧠 OCR + Ollama data extraction workflow preserved
๐งฉ Final Architecture Overview
form_filler/
│
├── app.py
├── templates/
│   ├── upload.html
│   ├── form.html
│   └── submissions.html
└── static/
    ├── uploads/
    └── pdfs/
⚙️ Step 1: Install Dependencies
pip install flask pytesseract pillow pdfplumber reportlab
SQLite is built-in with Python.
Make sure the Ollama server is running and the model has been pulled:
ollama serve
ollama pull llama3.2:latest
๐ง Step 2: Complete app.py
"""Admission-form filler web app.

Workflow: upload an image or PDF -> OCR the text -> ask a local Ollama model
to pull out the form fields -> let the user correct them -> save the record
to SQLite and render a downloadable PDF summary.
"""

import json
import os
import re
import sqlite3
import subprocess
from contextlib import closing
from datetime import datetime
from xml.sax.saxutils import escape

import pdfplumber
import pytesseract
from flask import (
    Flask,
    redirect,
    render_template,
    request,
    send_from_directory,
    url_for,
)
from PIL import Image
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph, SimpleDocTemplate, Spacer

app = Flask(__name__)

DB_FILE = 'submissions.db'

# Absolute paths so the app behaves the same from any working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
UPLOAD_FOLDER = os.path.join(BASE_DIR, 'static', 'uploads')
PDF_FOLDER = os.path.join(BASE_DIR, 'static', 'pdfs')

os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(PDF_FOLDER, exist_ok=True)

app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
app.config['PDF_FOLDER'] = PDF_FOLDER

IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png'}
ALLOWED_EXTENSIONS = IMAGE_EXTENSIONS | {'.pdf'}

# Column order shared by the INSERT statement and the extraction prompt.
FORM_FIELDS = (
    'name',
    'date_of_birth',
    'address',
    'college_name',
    'course_applied',
    'marks',
)

OLLAMA_MODEL = "llama3.2:latest"
OLLAMA_TIMEOUT_SECONDS = 90

# Only override pytesseract's lookup when the default Windows install is
# actually present; on Linux/macOS the binary is found through PATH and a
# forced Windows path would make every OCR call fail.
WINDOWS_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.isfile(WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = WINDOWS_TESSERACT_CMD


def _safe_filename(name, default="file"):
    """Reduce *name* to a single path component that is safe to write.

    Directory parts are dropped, every character other than a word
    character, dot or hyphen becomes ``_``, and leading dots are removed.
    Returns *default* when nothing usable is left.
    """
    base = os.path.basename(name.replace("\\", "/"))
    cleaned = re.sub(r"[^\w.-]", "_", base).lstrip(".")
    return cleaned or default


# ------------------------------
# DATABASE SETUP
# ------------------------------
def init_db():
    """Create the ``submissions`` table if it does not exist yet."""
    # closing() releases the connection; the inner ``conn`` context commits
    # on success and rolls back if the statement raises.
    with closing(sqlite3.connect(DB_FILE)) as conn, conn:
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS submissions (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT,
                date_of_birth TEXT,
                address TEXT,
                college_name TEXT,
                course_applied TEXT,
                marks TEXT,
                pdf_file TEXT,
                created_at TEXT
            )
            """
        )


def save_to_db(data, pdf_file):
    """Insert one submission row.

    Args:
        data: Mapping of form field names to values; missing fields are
            stored as empty strings.
        pdf_file: File name of the generated PDF inside ``PDF_FOLDER``.
    """
    values = [data.get(field, '') for field in FORM_FIELDS]
    values += [pdf_file, datetime.now().strftime("%Y-%m-%d %H:%M:%S")]
    with closing(sqlite3.connect(DB_FILE)) as conn, conn:
        conn.execute(
            """
            INSERT INTO submissions
                (name, date_of_birth, address, college_name,
                 course_applied, marks, pdf_file, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            values,
        )


def fetch_all_submissions():
    """Return all submissions, newest first.

    Each row is a tuple
    ``(id, name, college_name, course_applied, created_at, pdf_file)``.
    """
    with closing(sqlite3.connect(DB_FILE)) as conn:
        cur = conn.execute(
            "SELECT id, name, college_name, course_applied, created_at, pdf_file "
            "FROM submissions ORDER BY id DESC"
        )
        return cur.fetchall()


# ------------------------------
# OCR + OLLAMA UTILITIES
# ------------------------------
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of *pdf_path*, joined by newlines."""
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None for pages without a text layer.
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages).strip()


def extract_text(file_path):
    """Extract text from an image (via Tesseract OCR) or a PDF.

    Returns an empty string for unsupported file extensions.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext in IMAGE_EXTENSIONS:
        # OCR the file that was actually uploaded, not a fixed sample image.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)
    if ext == '.pdf':
        return extract_text_from_pdf(file_path)
    return ""


def generate_pdf(data, output_path):
    """Write a one-page summary PDF of *data* to *output_path*.

    The ``submit`` key (the form button) is skipped. Keys and values are
    XML-escaped because Paragraph text is parsed as markup; a raw ``&`` or
    ``<`` typed by the user would otherwise break or alter the layout.
    """
    doc = SimpleDocTemplate(output_path, pagesize=A4)
    styles = getSampleStyleSheet()
    elements = [
        Paragraph("<b>Admission Form Summary</b>", styles['Title']),
        Spacer(1, 20),
    ]
    for key, value in data.items():
        if key == "submit":
            continue
        label = escape(key.replace('_', ' ').title())
        elements.append(
            Paragraph(f"<b>{label}:</b> {escape(str(value))}", styles['Normal'])
        )
        elements.append(Spacer(1, 10))
    doc.build(elements)


def _parse_model_json(response):
    """Return the outermost ``{...}`` object found in *response* as a dict.

    Returns ``{}`` when the text contains no brace-delimited span. JSON
    ``null`` values become empty strings so the form shows blank inputs.

    Raises:
        ValueError: If the brace-delimited span is not valid JSON.
    """
    start = response.find('{')
    end = response.rfind('}')
    if start == -1 or end < start:
        return {}
    parsed = json.loads(response[start:end + 1])
    return {key: '' if value is None else value for key, value in parsed.items()}


def _extract_fields(text):
    """Ask the Ollama model to extract the form fields from *text*.

    Returns the parsed field dict, or ``{"error": ...}`` when the model
    cannot be run, exits non-zero, times out, or returns invalid JSON.
    """
    prompt = f"""You are a data extraction assistant.
Extract and return as JSON the following fields if present:
- name
- date_of_birth
- address
- college_name
- course_applied
- marks

Text:
{text}
"""
    try:
        result = subprocess.run(
            ["ollama", "run", OLLAMA_MODEL],
            input=prompt.encode("utf-8"),
            capture_output=True,
            timeout=OLLAMA_TIMEOUT_SECONDS,
            check=True,  # surface a failed model run instead of an empty form
        )
        return _parse_model_json(result.stdout.decode("utf-8", errors="replace"))
    except (OSError, subprocess.SubprocessError, ValueError) as exc:
        return {"error": f"Model error: {exc}"}


# ------------------------------
# ROUTES
# ------------------------------
@app.route('/')
def upload_page():
    """Show the upload form."""
    return render_template('upload.html')


@app.route('/process', methods=['POST'])
def process_image():
    """Save the upload, run OCR + field extraction, and show the editable form."""
    if 'file' not in request.files:
        return "No file uploaded.", 400
    file = request.files['file']
    if not file.filename:
        return "No file selected.", 400

    # The client controls the file name: strip directories and odd characters
    # before using it as a path, and refuse types we cannot process so that
    # arbitrary files never land in the publicly served static folder.
    safe_name = _safe_filename(file.filename, default="upload")
    if os.path.splitext(safe_name)[1].lower() not in ALLOWED_EXTENSIONS:
        return "Unsupported file type. Upload a JPG, PNG or PDF.", 400

    filepath = os.path.join(app.config['UPLOAD_FOLDER'], safe_name)
    file.save(filepath)

    text = extract_text(filepath)
    if not text.strip():
        return "No text could be extracted. Try a clearer image or text-based PDF.", 422

    data = _extract_fields(text)
    return render_template('form.html', data=data, image=safe_name)


@app.route('/submit', methods=['POST'])
def submit_form():
    """Generate the PDF for the submitted form, store the record, and redirect."""
    data = request.form.to_dict()
    applicant = _safe_filename(data.get('name') or 'anonymous', default='anonymous')
    # The timestamp keeps two applicants with the same name from overwriting
    # each other's PDF.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    pdf_filename = f"filled_form_{applicant}_{stamp}.pdf"
    pdf_path = os.path.join(app.config['PDF_FOLDER'], pdf_filename)
    generate_pdf(data, pdf_path)
    save_to_db(data, pdf_filename)
    return redirect(url_for('view_submissions'))


@app.route('/submissions')
def view_submissions():
    """List all stored submissions."""
    rows = fetch_all_submissions()
    return render_template('submissions.html', submissions=rows)


@app.route('/download/<filename>')
def download_pdf(filename):
    """Send a generated PDF from ``PDF_FOLDER`` as an attachment."""
    return send_from_directory(app.config['PDF_FOLDER'], filename, as_attachment=True)


# Create the table at import time so the app also works when started through
# ``flask run`` or a WSGI server, not only via ``python app.py``.
init_db()

if __name__ == "__main__":
    # The interactive debugger allows code execution from the browser, so it
    # is opt-in: set FLASK_DEBUG=1 for local development.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1")
๐งพ templates/upload.html
<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Declare the encoding explicitly: the page contains non-ASCII characters. -->
  <meta charset="utf-8">
  <title>Upload Document</title>
  <style>
    body { font-family: Arial; background: #eef; padding: 40px; }
    .container { background: #fff; padding: 30px; border-radius: 10px; width: 400px; margin: auto; box-shadow: 0 0 10px #aaa; }
  </style>
</head>
<body>
  <div class="container">
    <h2>Upload Document (Image or PDF)</h2>
    <!-- multipart/form-data is required for the file to reach request.files -->
    <form action="{{ url_for('process_image') }}" method="POST" enctype="multipart/form-data">
      <input type="file" name="file" accept=".jpg,.jpeg,.png,.pdf" required><br><br>
      <button type="submit">Process</button>
    </form>
    <hr>
    <a href="{{ url_for('view_submissions') }}">📋 View All Submissions</a>
  </div>
</body>
</html>
๐งฎ templates/form.html
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>Form Preview</title>
  <style>
    body { font-family: Arial; background: #eef; padding: 40px; }
    .container { background: #fff; padding: 30px; border-radius: 10px; width: 600px; margin: auto; box-shadow: 0 0 10px #aaa; }
    input, textarea { width: 100%; padding: 8px; margin: 8px 0; }
    .error { color: #b00; }
  </style>
</head>
<body>
  <div class="container">
    <h2>Auto-Filled Admission Form</h2>
    {# Surface extraction failures instead of silently showing an empty form. #}
    {% if data.get('error') %}
    <p class="error">{{ data.get('error') }}</p>
    {% endif %}
    <form method="POST" action="{{ url_for('submit_form') }}">
      <label for="name">Name:</label>
      <input type="text" id="name" name="name" value="{{ data.get('name', '') }}">
      <label for="date_of_birth">Date of Birth:</label>
      <input type="text" id="date_of_birth" name="date_of_birth" value="{{ data.get('date_of_birth', '') }}">
      <label for="address">Address:</label>
      <textarea id="address" name="address">{{ data.get('address', '') }}</textarea>
      <label for="college_name">College Name:</label>
      <input type="text" id="college_name" name="college_name" value="{{ data.get('college_name', '') }}">
      <label for="course_applied">Course Applied:</label>
      <input type="text" id="course_applied" name="course_applied" value="{{ data.get('course_applied', '') }}">
      <label for="marks">Marks:</label>
      <input type="text" id="marks" name="marks" value="{{ data.get('marks', '') }}">
      <button type="submit">Submit &amp; Save</button>
    </form>
  </div>
</body>
</html>
๐️ templates/submissions.html
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <title>All Submissions</title>
  <style>
    body { font-family: Arial; background: #eef; padding: 40px; }
    table { border-collapse: collapse; width: 90%; margin: auto; background: #fff; }
    th, td { border: 1px solid #ccc; padding: 10px; text-align: left; }
    th { background: #ddd; }
    a { text-decoration: none; color: blue; }
    .centered { text-align: center; }
  </style>
</head>
<body>
  <h2 class="centered">📋 All Submitted Admission Forms</h2>
  <table>
    <tr>
      <th>ID</th>
      <th>Name</th>
      <th>College</th>
      <th>Course</th>
      <th>Date</th>
      <th>PDF</th>
    </tr>
    {# Each row is (id, name, college_name, course_applied, created_at, pdf_file). #}
    {% for s in submissions %}
    <tr>
      <td>{{ s[0] }}</td>
      <td>{{ s[1] }}</td>
      <td>{{ s[2] }}</td>
      <td>{{ s[3] }}</td>
      <td>{{ s[4] }}</td>
      <td><a href="{{ url_for('download_pdf', filename=s[5]) }}">Download</a></td>
    </tr>
    {% else %}
    <tr>
      <td colspan="6" class="centered">No submissions yet.</td>
    </tr>
    {% endfor %}
  </table>
  <div class="centered" style="margin-top:20px;">
    <a href="{{ url_for('upload_page') }}">⬅️ Back to Upload</a>
  </div>
</body>
</html>
๐ Run & Test
python app.py
Open browser → http://localhost:5000
✅ Features
- Upload scanned image or PDF
- Extracts data with OCR + Ollama
- Auto-fills editable web form
- Generates PDF of final data
- Saves record to SQLite DB
- View all records at /submissions with download links
Would you like me to add Admin login protection for the /submissions
page next (so only authorized users can view/download records)?
Here is how you can create real sample image/PDF files and use them to test your OCR + Ollama web app.
Since I can’t attach files directly here, I’ll give you ready-to-generate content you can copy-paste and use locally.
๐ผ️ SAMPLE IMAGE (Admission Form – Image)
1️⃣ Open any text editor (e.g., Notepad).
2️⃣ Copy this content and save it as sample_form.txt:
Name: Anitha R
Date of Birth: 23-05-2003
Address: 12, Gandhi Street, Madurai
College Name: PSG College of Arts and Science
Course Applied: B.Sc Computer Science
Marks: 472/500
3️⃣ Now convert this text file into an image:
Option A: Use Python (quick)
from PIL import Image, ImageDraw, ImageFont

# One entry per printed line of the sample admission form.
form_lines = [
    "Name: Anitha R",
    "Date of Birth: 23-05-2003",
    "Address: 12, Gandhi Street, Madurai",
    "College Name: PSG College of Arts and Science",
    "Course Applied: B.Sc Computer Science",
    "Marks: 472/500",
]

# White 800x400 canvas with the form text drawn in black at offset (30, 30).
sample = Image.new("RGB", (800, 400), color="white")
ImageDraw.Draw(sample).text((30, 30), "\n".join(form_lines), fill="black")

sample.save("sample_form.jpg")
print("Saved sample_form.jpg")
Run this once; it will create sample_form.jpg
— upload it in your Flask app’s upload page.
๐ SAMPLE PDF (Admission Form – PDF)
Option A: Generate using Python
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

# Text-based sample PDF for the upload page.
c = canvas.Canvas("sample_form.pdf", pagesize=A4)
text = """Name: Arjun M
Date of Birth: 15-11-2002
Address: 45, Rajaji Salai, Chennai
College Name: Loyola College
Course Applied: M.Sc Data Science
Marks: 489/500"""
c.setFont("Helvetica", 12)
# Start at (80, 750) and step y down 20 units for each line of text.
x, y = 80, 750
for line in text.split("\n"):
    c.drawString(x, y, line)
    y -= 20
c.save()
print("Saved sample_form.pdf")
After running this, you’ll have sample_form.pdf
in your working directory.
Upload this in your app — the OCR + LLM pipeline should extract data automatically.
✅ QUICK TEST PLAN
| Step | Input | Expected Output |
|---|---|---|
| 1 | Upload sample_form.jpg | Text auto-extracted (“Anitha R”, “PSG College …”) |
| 2 | Upload sample_form.pdf | Text auto-extracted (“Arjun M”, “Loyola College”) |
| 3 | Edit fields if needed | Submit |
| 4 | Go to /submissions | Entries listed with download links |
| 5 | Download PDF | Check that it matches entered data |
------------------------
Sometimes OCR fails with a Tesseract error, usually because the path to the Tesseract executable has not been specified. That error means your Python code is correct, but the Tesseract OCR engine itself isn’t installed or can’t be found on your system PATH.
Let’s fix it step-by-step depending on your OS ๐
๐ช If you’re on Windows
1️⃣ Install Tesseract OCR
Download the Windows installer from the official source:
๐ https://github.com/UB-Mannheim/tesseract/wiki
Pick the latest version (for example, tesseract-ocr-w64-setup-5.x.x.exe
).
Run the installer and let it install to the default path, e.g.:
C:\Program Files\Tesseract-OCR\
2️⃣ Add to System PATH
- Open System Properties → Advanced → Environment Variables
- Under System variables, find Path, then click Edit
- Add this entry: C:\Program Files\Tesseract-OCR\
- Click OK everywhere.
3️⃣ Verify installation
Open a new Command Prompt and run:
tesseract --version
If it prints version info (like tesseract v5.3.3.0
), it’s good.
4️⃣ Update your Python code (optional safety net)
Add this line before calling pytesseract.image_to_string
:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
Full example:
import pytesseract
from PIL import Image

# On Windows, tell pytesseract exactly where the Tesseract binary lives
# so it does not depend on the PATH environment variable.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Run OCR on the sample image and show the recognised text.
sample_image = Image.open("sample_form.jpg")
recognised = pytesseract.image_to_string(sample_image)
print(recognised)
๐ง If you’re on Linux
Install via apt:
sudo apt update
sudo apt install tesseract-ocr -y
Verify:
tesseract --version
Then you’re done — no path setting needed.
๐ If you’re on macOS
Using Homebrew:
brew install tesseract
Verify:
tesseract --version
Then run your Flask app again — it’ll work immediately.
✅ After installation
Restart your Flask app:
python app.py
Upload the sample image (sample_form.jpg
) — you should now see extracted text on screen, no Tesseract errors ๐ฏ
Successful output: open http://localhost:5000/submissions and check that your submissions are listed with download links.
Use Case 1 Successfully Implemented!!!