Lazabriellholland committed on
Commit
9c275fa
·
verified ·
1 Parent(s): 77422b6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -87
app.py CHANGED
@@ -1,88 +1,4 @@
1
- import os, io, re, json
2
  import streamlit as st
3
- from PIL import Image
4
-
5
- import pdfplumber # text layer for PDFs
6
- from pdf2image import convert_from_bytes # render image-only PDFs
7
- import pytesseract # OCR
8
-
9
- st.set_page_config(page_title="Flyer → Where & When", page_icon="📰", layout="centered")
10
- st.title("📰 Flyer → Where & When")
11
- st.caption("Upload a flyer (PDF/JPG/PNG). I’ll read it and pull out the date, time, and location.")
12
-
13
- DATE_PAT = r"(\b\w{3,9}\b\s+\d{1,2}(?:,\s*\d{4})?)|\b\d{1,2}/\d{1,2}/\d{2,4}\b"
14
- TIME_PAT = r"\b(\d{1,2}:\d{2}\s*[AP]M|\d{1,2}\s*[AP]M)\b"
15
- LOC_HINTS = r"(Location|Where|Address|Venue|At)[:\-\s]+(.+)"
16
-
17
- def pdf_bytes_to_text(pdf_bytes: bytes) -> str:
18
- # Try selectable text first (more accurate)
19
- parts = []
20
- try:
21
- with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
22
- for page in pdf.pages:
23
- t = page.extract_text() or ""
24
- if t.strip():
25
- parts.append(t)
26
- except Exception:
27
- pass
28
- if parts:
29
- return "\n\n".join(parts)
30
- # Fallback to OCR (image-only PDF)
31
- txts = []
32
- for img in convert_from_bytes(pdf_bytes, fmt="png", dpi=200):
33
- txts.append(pytesseract.image_to_string(img))
34
- return "\n".join(txts)
35
-
36
- def image_bytes_to_text(img_bytes: bytes) -> str:
37
- img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
38
- return pytesseract.image_to_string(img)
39
-
40
- def extract_where_when(text: str):
41
- # Dates (flatten alternation tuples)
42
- dates_raw = re.findall(DATE_PAT, text, flags=re.I)
43
- dates = []
44
- for tup in dates_raw:
45
- s = [t for t in tup if t]
46
- if s: dates.append(s[0])
47
- times = re.findall(TIME_PAT, text, flags=re.I)
48
- m = re.search(LOC_HINTS, text, flags=re.I)
49
- location = m.group(2).strip() if m else ""
50
- return {"dates_found": dates, "times_found": times, "location": location}
51
-
52
- uploaded = st.file_uploader("Upload flyer", type=["pdf", "png", "jpg", "jpeg"])
53
- manual_text = st.text_area("Or paste text manually (optional)", height=140)
54
-
55
- if st.button("Read flyer"):
56
- if not uploaded and not manual_text.strip():
57
- st.warning("Please upload a file or paste some text.")
58
- st.stop()
59
-
60
- raw = ""
61
- if uploaded:
62
- if uploaded.type == "application/pdf":
63
- raw = pdf_bytes_to_text(uploaded.read())
64
- else:
65
- raw = image_bytes_to_text(uploaded.read())
66
-
67
- combined_text = manual_text.strip() or raw
68
- if not combined_text.strip():
69
- st.error("I couldn’t read any text. Try exporting your Canva flyer as Standard PDF or a larger PNG.")
70
- st.stop()
71
-
72
- results = extract_where_when(combined_text)
73
- st.success("Done! Here’s what I found:")
74
- c1, c2 = st.columns(2)
75
- with c1:
76
- st.markdown("**Date(s):**")
77
- st.write(", ".join(results["dates_found"]) if results["dates_found"] else "—")
78
- st.markdown("**Time(s):**")
79
- st.write(", ".join(results["times_found"]) if results["times_found"] else "—")
80
- with c2:
81
- st.markdown("**Location:**")
82
- st.write(results["location"] or "—")
83
-
84
- st.markdown("---")
85
- st.markdown("**All extracted text (for reference):**")
86
- st.code(combined_text[:3000])
87
- st.download_button("Download JSON", data=json.dumps(results, indent=2),
88
- file_name="where_when.json", mime="application/json")
 
 
1
  import streamlit as st
2
+ st.set_page_config(page_title="OK", page_icon="✅")
3
+ st.title("✅ Streamlit + Docker is running")
4
+ st.write("If you can see this page, your Docker Space is healthy.")