| import os |
| import re |
| import subprocess |
| import numpy as np |
| from PIL import Image |
| import gradio as gr |
| import torch |
| from transformers import AutoProcessor, Florence2ForConditionalGeneration |
|
|
|
|
| |
# Hugging Face repo id of the checkpoint; reused below to build the page title.
model_name = "PJMixers-Images/Florence-2-base-Castollux-v0.5-JPEG-Quality-Detection-v0.1"

# Run on CUDA when torch reports it as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the model, switch it to eval mode, then load the matching processor.
model = Florence2ForConditionalGeneration.from_pretrained(model_name)
model.eval()
processor = AutoProcessor.from_pretrained(model_name)

# Move the model's weights onto the selected device.
model.to(device)

# Markdown heading that links to the model page on the Hugging Face hub.
TITLE = f"# [{model_name}](https://huggingface.co/{model_name})"
|
|
|
|
def process_image(image, num_beams=5, min_p=0.0, top_p=1.0):
    """
    Run the "<JPEG_QUALITY>" prompt on a single image and return the decoded text.

    Args:
        image: A file path (str), a numpy array, or a PIL Image. ``None``
            (e.g. a submit with nothing uploaded) is rejected up front with
            a readable message.
        num_beams: Number of beams passed to ``model.generate``. Coerced to
            ``int`` in case the caller hands over a float.
        min_p: Min-P value forwarded to ``model.generate``.
        top_p: Top-P value forwarded to ``model.generate``.

    Returns:
        The decoded output with the literal ``</s>``, ``<s>`` and ``<pad>``
        markers removed and surrounding whitespace stripped. On any failure
        a string of the form ``"Error processing image: <reason>"`` is
        returned instead of raising, so the caller can display it.

    Note:
        Generation runs with ``do_sample=True``, so repeated calls on the
        same image are not guaranteed to produce the same text.
    """
    # Fail early with a clear message instead of an AttributeError text.
    if image is None:
        return "Error processing image: no image provided"

    try:
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image)
        elif isinstance(image, str):
            # Convert inside the context manager: convert() loads the pixel
            # data and returns a new image, so the file handle can be
            # closed right away instead of lingering on a lazy image.
            with Image.open(image) as opened:
                image = opened.convert("RGB")
        if image.mode != "RGB":
            image = image.convert("RGB")

        inputs = processor(
            text="<JPEG_QUALITY>",
            images=image,
            return_tensors="pt",
            padding=False,
            truncation=False,
        )

        # Move every tensor produced by the processor to the model's device.
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=inputs["input_ids"],
                pixel_values=inputs["pixel_values"],
                max_new_tokens=8,
                num_beams=int(num_beams),
                do_sample=True,
                top_p=top_p,
                min_p=min_p,
            )

        # Decode with special tokens kept, then drop only these three
        # markers by hand (same order as before).
        text = processor.batch_decode(
            generated_ids,
            skip_special_tokens=False,
        )[0]
        for marker in ("</s>", "<s>", "<pad>"):
            text = text.replace(marker, "")
        return text.strip()

    except Exception as e:
        # Deliberate catch-all: the failure is reported as the returned text
        # rather than propagated to the caller.
        return f"Error processing image: {e}"
|
|
|
|
| |
# Page-level CSS handed to gr.Blocks.
# NOTE(review): no component below sets elem_id="output", so this rule
# appears to match nothing — confirm whether the textbox was meant to use it.
css = """
#output { height: 500px; overflow: auto; border: 1px solid #ccc; }
"""

# Sample images offered as clickable examples.
example_files = [
    "000001_q25.jpg",
    "000002_q20.jpg",
    "000014_q50.jpg",
    "000017_q20.jpg",
    "000032_q0.jpg",
    "000035_q15.jpg",
    "000035_q70.jpg",
    "0003_q95.jpg",
    "0021_q35.jpg",
    "0037_q0.jpg",
    "0040_q0.jpg",
]
# Every example row uses the same settings: 5 beams, min_p 0.0, top_p 1.0.
example_rows = [[filename, 5, 0.0, 1.0] for filename in example_files]

# NOTE(review): the original nesting of this layout was lost; the button,
# sliders and examples are assumed to live inside the tab, below the
# image/text row — confirm against the running app.
with gr.Blocks(css=css) as demo:
    gr.Markdown(TITLE)

    with gr.Tab(label="Single Image Processing"):
        # Input picture on the left, model output on the right.
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        submit_btn = gr.Button(value="Submit")

        # Generation settings forwarded to process_image.
        num_beams_slider = gr.Slider(
            label="Number of Beams", minimum=1, maximum=5, step=1, value=5
        )
        min_p_slider = gr.Slider(
            label="Min-P", minimum=0, maximum=1, step=0.01, value=0.0
        )
        top_p_slider = gr.Slider(
            label="Top-P", minimum=0, maximum=1, step=0.01, value=1.0
        )

        # Same argument order as process_image(image, num_beams, min_p, top_p).
        control_inputs = [input_img, num_beams_slider, min_p_slider, top_p_slider]

        gr.Examples(
            examples=example_rows,
            inputs=control_inputs,
            outputs=[output_text],
            fn=process_image,
            label="Try captioning on below examples",
        )

        # Clicking Submit runs the model and writes the text to the textbox.
        submit_btn.click(
            fn=process_image,
            inputs=control_inputs,
            outputs=[output_text],
        )


if __name__ == "__main__":
    demo.launch(debug=True)
|
|