# Spaces: Calcifer0323 / matching (Sleeping)
# File size: 9,040 Bytes
# 93cd57d

"""
Скрипт для индексации всех объектов недвижимости через HuggingFace Spaces сервис

Usage:
    python index_all_properties.py          # Интерактивный режим
    python index_all_properties.py --yes    # Автоподтверждение
"""
import os
import sys
import time
from typing import Any, Dict, List, Optional

import psycopg2
import requests

# Database configuration.
# Every value can be overridden through an environment variable (DB_HOST,
# DB_PORT, DB_NAME, DB_USER, DB_PASSWORD); the literals below are used only
# as fallbacks so the script keeps working without any extra setup.
# SECURITY NOTE(review): real credentials are committed here as defaults —
# rotate them and supply the secret via DB_PASSWORD instead.
DB_CONFIG = {
    'host': os.environ.get('DB_HOST', 'dpg-d5ht8vi4d50c739akh2g-a.virginia-postgres.render.com'),
    # Environment values are strings, so the port is always converted to int.
    'port': int(os.environ.get('DB_PORT', '5432')),
    'database': os.environ.get('DB_NAME', 'lead_exchange_bk'),
    'user': os.environ.get('DB_USER', 'lead_exchange_bk_user'),
    'password': os.environ.get('DB_PASSWORD', '8m2gtTRBW0iAr7nY2Aadzz0VcZBEVKYM'),
}

# URL of the service on HuggingFace Spaces (override with HF_SERVICE_URL)
HF_SERVICE_URL = os.environ.get('HF_SERVICE_URL', "https://calcifer0323-matching.hf.space")

def get_properties_from_db() -> List[Dict[str, Any]]:
    """Fetch all property rows from the database.

    Connects with ``DB_CONFIG``, selects every row of the ``properties``
    table ordered by ``created_at`` descending, and converts each row into a
    dict keyed by column name.

    Returns:
        A list of dicts with the keys ``property_id``, ``title``,
        ``description``, ``address``, ``property_type``, ``area``, ``price``,
        ``rooms`` and ``status``.

    Raises:
        Any exception raised by ``psycopg2`` while connecting or querying is
        propagated unchanged; the cursor and connection are closed first.
    """
    print("📥 Fetching properties from database...")

    # Must stay in the same order as the SELECT list below.
    columns = ['property_id', 'title', 'description', 'address', 'property_type',
               'area', 'price', 'rooms', 'status']

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("""
                SELECT property_id, title, description, address, property_type,
                       area, price, rooms, status
                FROM properties
                ORDER BY created_at DESC
            """)
            rows = cursor.fetchall()
        finally:
            # Release the cursor even when the query fails.
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    properties = [dict(zip(columns, row)) for row in rows]

    print(f"✅ Fetched {len(properties)} properties")
    return properties

def prepare_text_for_property(prop: Dict[str, Any]) -> str:
    """Compose the text from which a property's embedding is generated.

    Falsy or missing fields are skipped. The title, description and address
    each become their own sentence; property type, rooms, area and price are
    grouped into one trailing "characteristics" sentence.

    Args:
        prop: Property dict; all keys are optional.

    Returns:
        The sections joined with ". ", or an empty string when no field is set.
    """
    sections = []

    title = prop.get('title')
    if title:
        sections.append(f"Название: {title}")

    description = prop.get('description')
    if description:
        sections.append(f"Описание: {description}")

    address = prop.get('address')
    if address:
        sections.append(f"Адрес: {address}")

    # Structured attributes are collected separately and appended as one sentence
    specs = []

    property_type = prop.get('property_type')
    if property_type:
        specs.append(f"тип: {property_type}")

    rooms = prop.get('rooms')
    if rooms:
        specs.append(f"комнат: {rooms}")

    area = prop.get('area')
    if area:
        specs.append(f"площадь: {area} м²")

    price = prop.get('price')
    if price:
        # Thousands separators keep large prices readable
        specs.append(f"цена: {price:,} ₽")

    if specs:
        sections.append("Характеристики: " + ", ".join(specs))

    return ". ".join(sections)

def index_batch(properties: List[Dict[str, Any]], batch_size: int = 20) -> Optional[Dict[str, Any]]:
    """Index one batch of properties through the HuggingFace Spaces service.

    Builds one item per property and POSTs them together to the ``/batch``
    endpoint of ``HF_SERVICE_URL``.

    Args:
        properties: Property dicts. Each must contain ``property_id``; the
            other fields may be missing or ``None``.
        batch_size: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The decoded JSON body when the service answers with status 200,
        otherwise ``None`` (non-200 status, invalid JSON, timeout, connection
        or other request error). Failures are printed, not raised.
    """
    # 2 minutes per batch (was 5 minutes, but the server-side timeout is 30s)
    timeout_seconds = 120

    # Prepare the data for the /batch endpoint. Text fields fall back to ''
    # when the value is missing or None (e.g. a NULL database column).
    items = [
        {
            "entity_id": str(prop['property_id']),
            "title": prop.get('title') or '',
            "description": prop.get('description') or '',
            "price": float(prop['price']) if prop.get('price') else None,
            "rooms": int(prop['rooms']) if prop.get('rooms') else None,
            "area": float(prop['area']) if prop.get('area') else None,
            "address": prop.get('address') or '',
            "district": ""  # Could be extracted from address if needed
        }
        for prop in properties
    ]

    payload = {"items": items}

    try:
        print(f"  📤 Sending batch of {len(items)} items to {HF_SERVICE_URL}/batch")
        # NOTE: this is the length of the Python repr, i.e. only an
        # approximation of the size of the JSON body actually sent.
        print(f"     Payload size: {len(str(payload))} bytes")

        response = requests.post(
            f"{HF_SERVICE_URL}/batch",
            json=payload,
            timeout=timeout_seconds
        )

        print(f"     Response status: {response.status_code}")

        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:
                # A 200 response whose body is not valid JSON is unusable.
                print(f"  ❌ Invalid JSON in response: {e}")
                return None

        print(f"  ❌ Error: {response.status_code}")
        print(f"     Response: {response.text[:500]}")

        # Try to get more detailed information about the error
        try:
            error_detail = response.json()
            print(f"     Detail: {error_detail}")
        except ValueError:
            # Body is not JSON; the raw text was already printed above.
            pass

        return None

    except requests.exceptions.Timeout:
        print(f"  ❌ Request timeout ({timeout_seconds} seconds)")
        return None
    except requests.exceptions.ConnectionError as e:
        print(f"  ❌ Connection error: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"  ❌ Request failed: {e}")
        return None
def save_embeddings_to_file(results: List[Dict], filename: str = "generated_embeddings.json"):
    """Write the indexing results to a JSON file (for inspection).

    Args:
        results: JSON-serializable list of result dicts.
        filename: Path of the file to create or overwrite.

    The file is written as UTF-8 with a 2-space indent and non-ASCII
    characters kept as-is.
    """
    import json

    out_file = open(filename, mode='w', encoding='utf-8')
    with out_file:
        json.dump(obj=results, fp=out_file, indent=2, ensure_ascii=False)

    print(f"💾 Saved embeddings to {filename}")

def main():
    """Run the whole indexing workflow from the command line.

    Steps: load the properties from the database, show a sample, ask for
    confirmation (skipped with ``--yes`` / ``-y``), send the properties to the
    service in batches, save the collected responses to
    ``indexing_results.json`` and print a summary.
    """
    banner = "=" * 70
    print(banner)
    print("INDEXING PROPERTIES THROUGH HUGGINGFACE SPACES")
    print(banner)

    # Check the command-line arguments
    auto_confirm = any(flag in sys.argv for flag in ('--yes', '-y'))

    if auto_confirm:
        print("🤖 Auto-confirm mode enabled")

    # 1. Load the properties from the database
    properties = get_properties_from_db()
    total = len(properties)

    if total == 0:
        print("⚠️  No properties found in database")
        return

    print(f"\n📊 Total properties to index: {total}")

    # Show an example
    first = properties[0]
    print("\n📄 Sample property:")
    print(f"  ID: {first['property_id']}")
    print(f"  Title: {first.get('title', 'N/A')}")
    print(f"  Text preview: {prepare_text_for_property(first)[:150]}...")

    # Confirmation
    if auto_confirm:
        print(f"\n✅ Auto-confirming indexing of {total} properties")
    else:
        print(f"\n🚀 Ready to index {total} properties")
    print(f"   Service: {HF_SERVICE_URL}")
    print("   Endpoint: /batch")

    if not auto_confirm:
        try:
            answer = input("\nProceed? (yes/y/no/n): ")
        except EOFError:
            print("\n❌ Error: EOF when reading input")
            print("Run with --yes flag to auto-confirm: python index_all_properties.py --yes")
            return
        if answer.lower() not in ('yes', 'y'):
            print("Cancelled by user")
            return

    # 2. Index in batches
    batch_size = 20  # Reduced from 50 to 20 (processing takes ~30 s on the server)
    total_batches = -(-total // batch_size)  # ceiling division

    print(f"\n📦 Processing {total_batches} batches (batch size: {batch_size})")
    print("   ⏱️  Each batch will take ~30-40 seconds to process")
    print(f"   📊 Total time estimate: ~{(total_batches * 35) // 60} minutes")

    all_results = []
    successful = 0
    failed = 0

    for batch_num, start in enumerate(range(0, total, batch_size), start=1):
        batch = properties[start:start + batch_size]
        batch_len = len(batch)

        print(f"\n🔄 Batch {batch_num}/{total_batches} ({batch_len} items)")

        result = index_batch(batch, batch_size)

        if not result:
            print("  ❌ Batch failed completely")
            failed += batch_len
        else:
            all_results.append(result)
            batch_successful = result.get('successful', 0)
            batch_failed = result.get('failed', 0)
            successful += batch_successful
            failed += batch_failed

            print(f"  ✅ Success: {batch_successful}/{batch_len}")
            if batch_failed > 0:
                print(f"  ⚠️  Failed: {batch_failed}")

        # Pause between batches (not after the last one)
        if start + batch_size < total:
            print("  ⏳ Waiting 10 seconds before next batch...")
            time.sleep(10)

    # 3. Save the results
    if all_results:
        save_embeddings_to_file(all_results, "indexing_results.json")

    # 4. Summary
    print("\n" + banner)
    print("INDEXING COMPLETE")
    print(banner)
    print(f"✅ Successfully indexed: {successful}/{total}")
    print(f"❌ Failed: {failed}/{total}")

    if successful > 0:
        print("\n💡 Note: Embeddings were generated on HuggingFace Spaces")
        print("   Results saved to: indexing_results.json")
        print("   Backend should fetch these embeddings and store in DB")

    print("\n" + banner)


if __name__ == '__main__':
    main()