File size: 3,577 Bytes
0b02ad1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import gradio as gr
from huggingface_hub import InferenceClient, auth_check
from deep_translator import GoogleTranslator
from PIL import Image
from gradio.themes import Base
import os
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

# Read the Hugging Face API token from the environment (None when unset)
hf_api_token = os.environ.get("HF_API_TOKEN")

# Shared client used for every Hugging Face Inference API request
client = InferenceClient(token=hf_api_token)

# Display name -> language code handed to deep_translator.
# Insertion order is the order shown in the language dropdown.
languages = dict(
    English="en",
    Hindi="hi",
    Tamil="ta",
    Spanish="es",
    French="fr",
    German="de",
    Bengali="bn",
    Telugu="te",
    Marathi="mr",
)

# Verify that the configured token can reach the captioning model
def check_model_access():
    """Report whether the configured token can access the captioning model.

    Returns:
        A human-readable status string. Failures are described in the
        returned text instead of being raised.
    """
    try:
        auth_check("Salesforce/blip-image-captioning-large", token=hf_api_token)
    except GatedRepoError:
        status = "Error: Token does not have permission to access this gated repository."
    except RepositoryNotFoundError:
        status = "Error: The repository was not found or you do not have access."
    except Exception as err:
        status = f"Error checking access: {err!s}"
    else:
        status = "Token has access to the model."
    return status

# Emit the access-check outcome when the module is loaded (debugging aid)
print(check_model_access())

def generate_caption(image, target_language_name):
    """Caption an image and return the caption in the selected language.

    The image is PNG-encoded in memory and sent to the
    ``Salesforce/blip-image-captioning-large`` model through the module-level
    ``client``. The resulting English caption is translated with
    ``GoogleTranslator`` unless English was requested.

    Args:
        image: Image object exposing ``save(fp, format=...)`` (the Gradio
            input is declared with ``type="pil"``). ``None`` is treated as
            "no image supplied".
        target_language_name: A key of the module-level ``languages``
            mapping, e.g. ``"Hindi"``.

    Returns:
        The caption text on success. Failures are never raised; they are
        returned as a string starting with ``"Error: "``.
    """
    # Reject a missing image up front; otherwise the user would only see the
    # opaque "'NoneType' object has no attribute 'save'" from the handler below.
    if image is None:
        return "Error: No image provided. Please upload an image."

    try:
        # Map the selected language name to its code
        target_language = languages.get(target_language_name)
        if not target_language:
            return f"Error: Selected language '{target_language_name}' is not supported. Please choose from: {list(languages.keys())}"

        # Serialize the image to PNG bytes for the API call
        from io import BytesIO
        with BytesIO() as buffer:
            image.save(buffer, format='PNG')
            png_bytes = buffer.getvalue()

        # Use the Hugging Face Inference API for captioning
        result = client.image_to_text(
            image=png_bytes,
            model="Salesforce/blip-image-captioning-large"
        )

        # Extract the generated text from the ImageToTextOutput object
        english_caption = result.generated_text

        # The model captions in English, so no translation is needed here
        if target_language == "en":
            return english_caption

        # Translate to the selected local language
        translator = GoogleTranslator(source='en', target=target_language)
        return translator.translate(english_caption)

    except Exception as exc:
        # UI boundary: surface any failure as text instead of crashing the app
        return f"Error: {exc}"

# Custom theme: Default theme with palette, sizing and font overrides
theme_fonts = [gr.themes.GoogleFont("Roboto"), "sans-serif"]
custom_theme = gr.themes.Default(
    font=theme_fonts,
    text_size="lg",
    radius_size="md",
    primary_hue="blue",
    secondary_hue="gray",
    neutral_hue="slate",
)

# Gradio interface: build the components first, then wire them together
image_input = gr.Image(type="pil", label="Upload an Image")
language_dropdown = gr.Dropdown(
    choices=list(languages),
    label="Select Language",
    value="English",
)
caption_output = gr.Textbox(
    label="Caption",
    lines=2,
    placeholder="Caption will appear here...",
)

# Page-level CSS overrides passed to the interface
page_css = """
        .gradio-container { max-width: 800px; margin: auto; }
        h1 { text-align: center; color: #1E40AF; }
        .label { font-weight: bold; }
        input, output { border-radius: 8px; }
    """

interface = gr.Interface(
    fn=generate_caption,
    inputs=[image_input, language_dropdown],
    outputs=caption_output,
    title="Image Caption Generator with Language Selection",
    description="Upload an image and select a local language to get a caption.",
    theme=custom_theme,
    css=page_css,
)

# Start the web app
interface.launch()