Spaces:
Sleeping
Update app.py
Browse files
import gradio as gr
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
from playwright.sync_api import sync_playwright
def md(soup, **options):
    """Render a parsed BeautifulSoup node as Markdown.

    Any keyword options are forwarded unchanged to ``MarkdownConverter``.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
def main_fn(url: str, check: list[str], request: gr.Request):
    """Load ``url`` in headless Chromium and return the page as Markdown.

    Args:
        url: Address of the page to open.
        check: Tag names selected in the UI; passed to the converter's
            ``strip`` option so those tags are dropped from the output.
        request: Incoming Gradio request; its ``user-agent`` header is
            reused for the browser context.

    Returns:
        The page title, a ``======`` underline, and the converted Markdown.
    """
    user_agent = request.headers["user-agent"]

    with sync_playwright() as p:
        browser = p.chromium.launch(
            args=[
                "--single-process",
                "--no-zygote",
                "--no-sandbox",
                "--disable-gpu",
                "--disable-dev-shm-usage",
                "--headless=new",
            ]
        )
        try:
            context = browser.new_context(user_agent=user_agent)
            page = context.new_page()
            # The response object is deliberately not inspected: goto() may
            # return None (e.g. for about:blank), and the status was unused.
            page.goto(url=url)
            content = page.content()
            title = page.title()
        finally:
            # Close the browser even when navigation or rendering fails.
            browser.close()

    soup = BeautifulSoup(content, features="html.parser")

    # Drop <script> and <style> elements so their text never reaches the output.
    for element in soup.find_all(["script", "style"]):
        element.decompose()

    # Convert the main content to Markdown: prefer <main>, then <body>, and
    # fall back to the whole document when neither exists.
    root = soup.find("main")
    if root is None:
        root = soup.find("body")
    if root is None:
        root = soup

    # Apply the tags the user chose to ignore.
    markdown = md(root, strip=check)

    return f"{title}\n======\n\n{markdown}"
# Gradio UI: a URL box plus a checkbox group of tags to strip, wired to main_fn.
# The description is rendered as Markdown/HTML, so the tag names shown to the
# user are entity-escaped; raw <a>/<img> would be rendered as real elements.
demo = gr.Interface(
    fn=main_fn,
    title="URL para Markdown V2",
    description="""
<div style="width: fit-content; margin: 0 auto;">
Este app acessa o HTML da URL informada e converte em Markdown.
Utiliza o Playwright, então funciona com páginas dinâmicas como React.
</div>
<div style="width: fit-content; margin: 0 auto;">
É possível ignorar links (<code>&lt;a&gt;</code>), imagens (<code>&lt;img&gt;</code>) e outros elementos.
</div>""",
    inputs=[
        gr.Text(label="URL", placeholder="https://*****"),
        gr.CheckboxGroup(
            label="Ignorar tags no Markdown gerado",
            choices=["a", "img", "noscript"],
            value=[],
        ),
    ],
    outputs=[
        gr.TextArea(label="Markdown gerado", show_copy_button=True)
    ],
    allow_flagging="never",
)

# Listen on all interfaces so the app is reachable from outside the container.
demo.launch(server_name="0.0.0.0")
|
@@ -8,7 +8,7 @@ def md(soup, **options):
|
|
| 8 |
return MarkdownConverter(**options).convert_soup(soup)
|
| 9 |
|
| 10 |
|
| 11 |
-
def main_fn(url: str, check: list[
|
| 12 |
user_agent = request.headers["user-agent"]
|
| 13 |
|
| 14 |
with sync_playwright() as p:
|
|
@@ -43,7 +43,7 @@ def main_fn(url: str, check: list[int], request: gr.Request):
|
|
| 43 |
body = soup.find("body")
|
| 44 |
main = soup.find("main")
|
| 45 |
|
| 46 |
-
strip_tags =
|
| 47 |
|
| 48 |
if main:
|
| 49 |
markdown = md(main, strip=strip_tags)
|
|
@@ -62,12 +62,12 @@ demo = gr.Interface(
|
|
| 62 |
Utiliza o Playwright, então funciona com páginas dinâmicas como React.
|
| 63 |
</div>
|
| 64 |
<div style="width: fit-content; margin: 0 auto;">
|
| 65 |
-
|
| 66 |
</div>""",
|
| 67 |
inputs=[
|
| 68 |
gr.Text(label="URL", placeholder="https://*****"),
|
| 69 |
gr.CheckboxGroup(
|
| 70 |
-
label="Ignorar tags
|
| 71 |
choices=["a", "img", "noscript"],
|
| 72 |
value=[],
|
| 73 |
),
|
|
@@ -78,4 +78,4 @@ demo = gr.Interface(
|
|
| 78 |
allow_flagging="never",
|
| 79 |
)
|
| 80 |
|
| 81 |
-
demo.launch(server_name="0.0.0.0")
|
|
|
|
| 8 |
return MarkdownConverter(**options).convert_soup(soup)
|
| 9 |
|
| 10 |
|
| 11 |
+
def main_fn(url: str, check: list[str], request: gr.Request):
|
| 12 |
user_agent = request.headers["user-agent"]
|
| 13 |
|
| 14 |
with sync_playwright() as p:
|
|
|
|
| 43 |
body = soup.find("body")
|
| 44 |
main = soup.find("main")
|
| 45 |
|
| 46 |
+
strip_tags = check # <- aplica o que o usuário selecionou
|
| 47 |
|
| 48 |
if main:
|
| 49 |
markdown = md(main, strip=strip_tags)
|
|
|
|
| 62 |
Utiliza o Playwright, então funciona com páginas dinâmicas como React.
|
| 63 |
</div>
|
| 64 |
<div style="width: fit-content; margin: 0 auto;">
|
| 65 |
+
É possível ignorar links (<code><a></code>), imagens (<code><img></code>) e outros elementos.
|
| 66 |
</div>""",
|
| 67 |
inputs=[
|
| 68 |
gr.Text(label="URL", placeholder="https://*****"),
|
| 69 |
gr.CheckboxGroup(
|
| 70 |
+
label="Ignorar tags no Markdown gerado",
|
| 71 |
choices=["a", "img", "noscript"],
|
| 72 |
value=[],
|
| 73 |
),
|
|
|
|
| 78 |
allow_flagging="never",
|
| 79 |
)
|
| 80 |
|
| 81 |
+
demo.launch(server_name="0.0.0.0")
|