Spaces:

Danielfonseca1212
/

StructuredExtractor

Sleeping

App Files Files Community

StructuredExtractor / extractor.py

Danielfonseca1212

Update extractor.py

72c3159 verified about 1 month ago

raw

history blame contribute delete

13 kB

	# extractor.py — Structured Output Engine
	# OpenAI Function Calling + Pydantic v2 + Dynamic JSON Schema
	"""
	Demonstra domínio de produção de:
	- OpenAI function calling (tool_choice forçando a chamada da função de extração)
	- Pydantic v2 para validação de schema dinâmico
	- JSON Schema gerado dinamicamente pelo usuário
	- Retry automático com error feedback ao LLM
	- Extração de múltiplos tipos: contrato, notícia, currículo, invoice, custom
	"""

	import json
	import re
	from typing import Any

	# ── SCHEMAS PRÉ-DEFINIDOS ─────────────────────────────────────

	def _field(kind, **extras):
	    """Return a JSON Schema node ``{"type": kind, ...extras}`` (fresh dict)."""
	    return {"type": kind, **extras}


	def _array(items, **extras):
	    """Return an ``array`` node whose elements follow the ``items`` schema."""
	    return {"type": "array", "items": items, **extras}


	def _object(properties, required=None):
	    """Return an ``object`` node; ``required`` is only emitted when given."""
	    node = {"type": "object", "properties": properties}
	    if required is not None:
	        node["required"] = required
	    return node


	def _preset(description, properties, required):
	    """Return one preset entry: a description plus its top-level object schema."""
	    return {"description": description, "schema": _object(properties, required)}


	# Built-in extraction presets, keyed by the display name of the document type.
	# Each entry holds a short "description" and the JSON Schema ("schema") that is
	# handed to the model as the tool parameters.
	PRESET_SCHEMAS = {
	    # Legal contracts: parties, object, value, dates and obligations.
	    "Contrato Legal": _preset(
	        "Extrai partes, objeto, valor, prazo e obrigações de contratos.",
	        {
	            "partes": _array(
	                _object(
	                    {
	                        "nome": _field("string"),
	                        "papel": _field(
	                            "string",
	                            enum=["contratante", "contratado", "fiador", "outro"],
	                        ),
	                    },
	                    required=["nome", "papel"],
	                )
	            ),
	            "objeto": _field("string", description="O que é contratado"),
	            "valor_total": _field("number", description="Valor em reais"),
	            "moeda": _field("string", default="BRL"),
	            "data_inicio": _field("string", description="YYYY-MM-DD ou descrição"),
	            "data_fim": _field("string", description="YYYY-MM-DD ou descrição"),
	            "obrigacoes_principais": _array(_field("string")),
	            "clausulas_especiais": _array(_field("string")),
	            "jurisdicao": _field("string"),
	            "assinado": _field("boolean"),
	        },
	        required=["partes", "objeto"],
	    ),
	    # News pieces: entities, key facts and metadata.
	    "Notícia / Artigo": _preset(
	        "Extrai entidades, fatos e metadados de textos jornalísticos.",
	        {
	            "titulo": _field("string"),
	            "data": _field("string"),
	            "autor": _field("string"),
	            "resumo": _field("string", description="1-2 frases"),
	            "pessoas": _array(_field("string")),
	            "organizacoes": _array(_field("string")),
	            "locais": _array(_field("string")),
	            "fatos_chave": _array(_field("string")),
	            "sentimento": _field(
	                "string", enum=["positivo", "negativo", "neutro", "misto"]
	            ),
	            "categorias": _array(
	                _field(
	                    "string",
	                    enum=[
	                        "política",
	                        "economia",
	                        "tecnologia",
	                        "saúde",
	                        "esporte",
	                        "cultura",
	                        "outro",
	                    ],
	                )
	            ),
	            "dados_numericos": _array(
	                _field("string"),
	                description="Números, percentuais, valores mencionados",
	            ),
	        },
	        required=["titulo", "resumo", "fatos_chave"],
	    ),
	    # Résumés: profile, work history, education and skills.
	    "Currículo / CV": _preset(
	        "Extrai perfil profissional, experiências e habilidades.",
	        {
	            "nome": _field("string"),
	            "email": _field("string"),
	            "telefone": _field("string"),
	            "cargo_atual": _field("string"),
	            "resumo_profissional": _field("string"),
	            "experiencias": _array(
	                _object(
	                    {
	                        "empresa": _field("string"),
	                        "cargo": _field("string"),
	                        "periodo": _field("string"),
	                        "descricao": _field("string"),
	                    },
	                    required=["empresa", "cargo"],
	                )
	            ),
	            "formacao": _array(
	                _object(
	                    {
	                        "instituicao": _field("string"),
	                        "curso": _field("string"),
	                        "ano": _field("string"),
	                    }
	                )
	            ),
	            "habilidades_tecnicas": _array(_field("string")),
	            "idiomas": _array(_field("string")),
	            "anos_experiencia": _field("integer"),
	        },
	        required=["nome", "experiencias"],
	    ),
	    # Invoices: issuer, recipient, line items and totals.
	    "Invoice / Nota Fiscal": _preset(
	        "Extrai dados financeiros e itens de notas fiscais e invoices.",
	        {
	            "numero_documento": _field("string"),
	            "data_emissao": _field("string"),
	            "data_vencimento": _field("string"),
	            "emitente": _object(
	                {
	                    "nome": _field("string"),
	                    "cnpj": _field("string"),
	                    "endereco": _field("string"),
	                }
	            ),
	            "destinatario": _object(
	                {
	                    "nome": _field("string"),
	                    "cnpj": _field("string"),
	                    "endereco": _field("string"),
	                }
	            ),
	            "itens": _array(
	                _object(
	                    {
	                        "descricao": _field("string"),
	                        "quantidade": _field("number"),
	                        "valor_unit": _field("number"),
	                        "valor_total": _field("number"),
	                    },
	                    required=["descricao", "valor_total"],
	                )
	            ),
	            "subtotal": _field("number"),
	            "impostos": _field("number"),
	            "total": _field("number"),
	            "moeda": _field("string", default="BRL"),
	            "forma_pagamento": _field("string"),
	            "observacoes": _field("string"),
	        },
	        required=["itens", "total"],
	    ),
	    # Scientific papers: metadata, methodology and reported results.
	    "Artigo Científico": _preset(
	        "Extrai metadados, metodologia e resultados de papers.",
	        {
	            "titulo": _field("string"),
	            "autores": _array(_field("string")),
	            "venue": _field("string", description="Conferência ou journal"),
	            "ano": _field("integer"),
	            "abstract": _field("string"),
	            "problema": _field(
	                "string", description="Problema que o paper resolve"
	            ),
	            "metodologia": _field("string"),
	            "modelo_proposto": _field("string"),
	            "datasets": _array(_field("string")),
	            "metricas": _array(
	                _object(
	                    {
	                        "nome": _field("string"),
	                        "valor": _field("string"),
	                        "dataset": _field("string"),
	                    }
	                )
	            ),
	            "contribuicoes": _array(_field("string")),
	            "limitacoes": _array(_field("string")),
	            "palavras_chave": _array(_field("string")),
	        },
	        required=["titulo", "autores", "problema"],
	    ),
	}

	# ── SYSTEM PROMPT ─────────────────────────────────────────────

	# System prompt (written in Portuguese) sent as the "system" message of every
	# extraction request. It tells the model to fill the JSON schema using only
	# what is explicitly in the text, to use null for missing fields, to extract
	# every list item, to keep numeric values as written, and to convert dates
	# to YYYY-MM-DD when possible. The literal is runtime text: do not reformat.
	SYSTEM = """Você é um extrator especialista de informações estruturadas.
	Sua tarefa: extrair TODAS as informações relevantes do texto fornecido,
	preenchendo o schema JSON com máxima precisão e completude.

	Regras:
	- Extraia apenas o que está explicitamente no texto
	- Use null para campos ausentes (não invente dados)
	- Para listas, extraia todos os itens encontrados
	- Preserve valores numéricos exatamente como aparecem
	- Datas: converta para YYYY-MM-DD quando possível
	- Se o campo for ambíguo, escolha a interpretação mais óbvia"""


	# ── ENGINE ────────────────────────────────────────────────────

	class StructuredExtractor:
	    """Extract structured data from free text via OpenAI function calling.

	    The caller supplies a JSON Schema. It is exposed to the model as the
	    parameters of a single tool that the model is forced to call, and the
	    tool-call arguments are parsed as the extraction result.
	    """

	    def __init__(self, openai_api_key: str, model: str = "gpt-4o-mini"):
	        """Create the OpenAI client.

	        Args:
	            openai_api_key: API key passed to the OpenAI client.
	            model: Chat model used for every extraction request.
	        """
	        # Imported here rather than at module level, so the ``openai``
	        # package is only required once an extractor is instantiated.
	        from openai import OpenAI

	        self.client = OpenAI(api_key=openai_api_key)
	        self.model = model

	    @staticmethod
	    def _tool_name(schema_name: str) -> str:
	        """Turn a human-readable schema name into a valid tool name.

	        OpenAI function names may only contain letters, digits, ``_`` and
	        ``-`` and are limited to 64 characters. Names that are already valid
	        after the historical ``lower()`` + space-to-underscore transform are
	        returned unchanged; anything else (accents, ``/``, over-long names)
	        is transliterated to ASCII and squeezed into that alphabet.
	        """
	        import unicodedata

	        candidate = schema_name.lower().replace(" ", "_")
	        if re.fullmatch(r"[a-z0-9_-]{1,64}", candidate):
	            return candidate

	        # NFKD splits accented letters into base letter + combining mark;
	        # encoding to ASCII with "ignore" then drops the marks.
	        ascii_only = (
	            unicodedata.normalize("NFKD", candidate)
	            .encode("ascii", "ignore")
	            .decode("ascii")
	        )
	        # Collapse every run of disallowed characters (and underscores)
	        # into a single underscore.
	        slug = re.sub(r"[^a-z0-9-]+", "_", ascii_only).strip("_")
	        slug = slug[:64].strip("_")
	        return slug or "extracted_data"

	    @staticmethod
	    def _validation_note() -> str:
	        """Report whether pydantic can be imported.

	        NOTE: this only reflects availability of the package. The extracted
	        data is not validated against the schema here. The two string values
	        are kept as-is because callers may display them.
	        """
	        try:
	            import pydantic  # noqa: F401
	        except ImportError:
	            return "pydantic_unavailable"
	        return "pydantic_ok"

	    def extract(self, text: str, schema: dict,
	                schema_name: str = "extracted_data",
	                max_retries: int = 2,
	                max_tokens: int = 1500) -> dict:
	        """Extract structured data from ``text`` using a forced tool call.

	        Args:
	            text: Source text to extract from.
	            schema: JSON Schema dict, passed as the tool's ``parameters``.
	            schema_name: Human-readable name; used in the tool description
	                and, after sanitising, as the tool name.
	            max_retries: Extra attempts after the first one, so at most
	                ``max_retries + 1`` requests are made. On a retry the
	                previous error text is appended as a user message.
	            max_tokens: Completion token limit for each request.

	        Returns:
	            Dict with keys ``data`` (parsed tool arguments), ``tokens``
	            (total tokens of the successful request), ``attempts``,
	            ``method``, ``validation`` and ``raw_json``.

	        Raises:
	            RuntimeError: If the last attempt returned arguments that are
	                not valid JSON.
	            Exception: Any other error raised by the last attempt is
	                re-raised unchanged.
	        """
	        tool_name = self._tool_name(schema_name)
	        tool = {
	            "type": "function",
	            "function": {
	                "name": tool_name,
	                "description": f"Extrai {schema_name} do texto fornecido.",
	                "parameters": schema,
	            }
	        }

	        messages = [
	            {"role": "system", "content": SYSTEM},
	            {"role": "user", "content": f"Texto para extração:\n\n{text}"},
	        ]

	        last_error = None
	        last_exc = None
	        for attempt in range(1, max_retries + 2):
	            if last_error:
	                # Retry: give the model the previous failure as feedback.
	                messages.append({
	                    "role": "user",
	                    "content": f"Erro na tentativa anterior: {last_error}. "
	                               f"Corrija e tente novamente respeitando o schema."
	                })

	            try:
	                resp = self.client.chat.completions.create(
	                    model=self.model,
	                    messages=messages,
	                    tools=[tool],
	                    # Force the model to call exactly this tool.
	                    tool_choice={"type": "function",
	                                 "function": {"name": tool_name}},
	                    temperature=0.0,
	                    max_tokens=max_tokens,
	                )

	                tool_calls = resp.choices[0].message.tool_calls
	                if not tool_calls:
	                    # Explicit message instead of an opaque TypeError from
	                    # indexing None; it is also what gets fed back on retry.
	                    raise ValueError("O modelo não retornou nenhuma tool call")

	                raw_json = tool_calls[0].function.arguments
	                data = json.loads(raw_json)

	                return {
	                    "data": data,
	                    "tokens": resp.usage.total_tokens,
	                    "attempts": attempt,
	                    "method": "function_calling",
	                    "validation": self._validation_note(),
	                    "raw_json": raw_json,
	                }

	            except json.JSONDecodeError as e:
	                # Malformed arguments: retry; if this was the last attempt
	                # the loop ends and the RuntimeError below is raised.
	                last_error = f"JSON inválido: {e}"
	                last_exc = e
	            except Exception as e:
	                last_error = str(e)
	                last_exc = e
	                if attempt > max_retries:
	                    raise

	        raise RuntimeError(
	            f"Falha após {max_retries+1} tentativas: {last_error}"
	        ) from last_exc

	    def extract_with_custom_schema(self, text: str, schema_json_str: str) -> dict:
	        """Parse a user-supplied JSON Schema string and run the extraction.

	        Args:
	            text: Source text to extract from.
	            schema_json_str: JSON Schema serialised as a JSON string.

	        Returns:
	            The result dict of :meth:`extract`.

	        Raises:
	            ValueError: If the string is not valid JSON, or if it is valid
	                JSON but not an object (tool parameters must be an object).
	        """
	        try:
	            schema = json.loads(schema_json_str)
	        except json.JSONDecodeError as e:
	            raise ValueError(f"Schema JSON inválido: {e}") from e
	        if not isinstance(schema, dict):
	            raise ValueError(
	                "Schema JSON inválido: o schema deve ser um objeto JSON, "
	                f"não {type(schema).__name__}"
	            )
	        return self.extract(text, schema, schema_name="custom_extraction")