13ze committed on
Commit
638f4a4
·
verified ·
1 Parent(s): 8b998b9

Update app.py

Browse files

import gradio as gr
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
from playwright.sync_api import sync_playwright


def md(soup, **options):
    """Convert a parsed BeautifulSoup tree (or tag) to Markdown text.

    All keyword options are forwarded unchanged to ``MarkdownConverter``.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)


def main_fn(url: str, check: list[str], request: gr.Request):
    """Render *url* in headless Chromium and return the page as Markdown.

    Args:
        url: Address of the page to load.
        check: Tag names selected in the UI (e.g. ``"a"``, ``"img"``); they
            are passed to the converter's ``strip`` option.
        request: Incoming Gradio request; its ``user-agent`` header is reused
            for the browser context.

    Returns:
        The page title, a ``======`` underline and the Markdown of the
        ``<main>`` element, falling back to ``<body>`` and then to the whole
        document when those elements are absent.
    """
    user_agent = request.headers["user-agent"]

    with sync_playwright() as p:
        browser = p.chromium.launch(
            args=[
                "--single-process",
                "--no-zygote",
                "--no-sandbox",
                "--disable-gpu",
                "--disable-dev-shm-usage",
                "--headless=new",
            ]
        )
        try:
            context = browser.new_context(user_agent=user_agent)
            page = context.new_page()

            # The response object is deliberately not inspected: goto() may
            # return None, and the status code was never used.
            page.goto(url=url)
            content = page.content()
            title = page.title()
        finally:
            # Close the browser even when navigation fails.
            browser.close()

    soup = BeautifulSoup(content, features="html.parser")

    # Drop <script> and <style> elements so their text does not leak
    # into the generated Markdown.
    for element in soup.find_all(["script", "style"]):
        element.decompose()

    # Prefer the <main> element; html.parser does not synthesize a <body>,
    # so fall back to the whole document when neither is present.
    root = soup.find("main")
    if root is None:
        root = soup.find("body")
    if root is None:
        root = soup

    # Apply the tags the user selected to be ignored.
    markdown = md(root, strip=check)

    return f"{title}\n======\n\n{markdown}"


# Gradio UI: a URL field plus a checkbox group selecting which tags to strip.
# The tag names shown in the description are entity-escaped (&lt;a&gt;,
# &lt;img&gt;) so the browser displays them literally instead of parsing them
# as real anchor/image elements.
demo = gr.Interface(
    fn=main_fn,
    title="URL para Markdown V2",
    description="""
<div style="width: fit-content; margin: 0 auto;">
Este app acessa o HTML da URL informada e converte em Markdown.
Utiliza o Playwright, então funciona com páginas dinâmicas como React.
</div>
<div style="width: fit-content; margin: 0 auto;">
É possível ignorar links (<code>&lt;a&gt;</code>), imagens (<code>&lt;img&gt;</code>) e outros elementos.
</div>""",
    inputs=[
        gr.Text(label="URL", placeholder="https://*****"),
        gr.CheckboxGroup(
            label="Ignorar tags no Markdown gerado",
            choices=["a", "img", "noscript"],
            value=[],
        ),
    ],
    outputs=[
        gr.TextArea(label="Markdown gerado", show_copy_button=True)
    ],
    allow_flagging="never",
)

# Bind to all interfaces so the app is reachable from outside the container.
demo.launch(server_name="0.0.0.0")

Files changed (1) hide show
  1. app.py +5 -5
app.py CHANGED
@@ -8,7 +8,7 @@ def md(soup, **options):
8
  return MarkdownConverter(**options).convert_soup(soup)
9
 
10
 
11
- def main_fn(url: str, check: list[int], request: gr.Request):
12
  user_agent = request.headers["user-agent"]
13
 
14
  with sync_playwright() as p:
@@ -43,7 +43,7 @@ def main_fn(url: str, check: list[int], request: gr.Request):
43
  body = soup.find("body")
44
  main = soup.find("main")
45
 
46
- strip_tags = [] # Extração completa, sem ignorar imagens nem links
47
 
48
  if main:
49
  markdown = md(main, strip=strip_tags)
@@ -62,12 +62,12 @@ demo = gr.Interface(
62
  Utiliza o Playwright, então funciona com páginas dinâmicas como React.
63
  </div>
64
  <div style="width: fit-content; margin: 0 auto;">
65
- O conteúdo é convertido para Markdown de forma limpa e copiável.
66
  </div>""",
67
  inputs=[
68
  gr.Text(label="URL", placeholder="https://*****"),
69
  gr.CheckboxGroup(
70
- label="Ignorar tags (sem efeito - tudo será extraído)",
71
  choices=["a", "img", "noscript"],
72
  value=[],
73
  ),
@@ -78,4 +78,4 @@ demo = gr.Interface(
78
  allow_flagging="never",
79
  )
80
 
81
- demo.launch(server_name="0.0.0.0")
 
8
  return MarkdownConverter(**options).convert_soup(soup)
9
 
10
 
11
+ def main_fn(url: str, check: list[str], request: gr.Request):
12
  user_agent = request.headers["user-agent"]
13
 
14
  with sync_playwright() as p:
 
43
  body = soup.find("body")
44
  main = soup.find("main")
45
 
46
+ strip_tags = check # <- aplica o que o usuário selecionou
47
 
48
  if main:
49
  markdown = md(main, strip=strip_tags)
 
62
  Utiliza o Playwright, então funciona com páginas dinâmicas como React.
63
  </div>
64
  <div style="width: fit-content; margin: 0 auto;">
65
+ É possível ignorar links (<code>&lt;a&gt;</code>), imagens (<code>&lt;img&gt;</code>) e outros elementos.
66
  </div>""",
67
  inputs=[
68
  gr.Text(label="URL", placeholder="https://*****"),
69
  gr.CheckboxGroup(
70
+ label="Ignorar tags no Markdown gerado",
71
  choices=["a", "img", "noscript"],
72
  value=[],
73
  ),
 
78
  allow_flagging="never",
79
  )
80
 
81
+ demo.launch(server_name="0.0.0.0")