Spaces:

neyugncol
/

video-chatbot

Runtime error

App Files Files Community

neyugncol committed on Jun 30, 2025

Commit

3dddfe4

verified ·

1 Parent(s): 422fca6

Add read video segment tool

Browse files

Files changed (6) hide show

app.py +94 -84
prompt.py +37 -0
rag.py +47 -7
tools.py +51 -38
transcriber.py +12 -7
utils.py +24 -0

app.py CHANGED Viewed

@@ -1,85 +1,95 @@
-import os
-import shutil
-import gradio as gr
-from smolagents import ChatMessageToolCall, ActionStep, FinalAnswerStep
-from agent import VideoChatbot
-from configs import settings
-bot = VideoChatbot(
-    model=settings.CHATBOT_MODEL,
-    api_base=settings.MODEL_BASE_API,
-    api_key=os.environ['GEMINI_API_KEY']
-)
-def chat(message: dict, history: list[dict]):
-    # move the file to the data directory
-    message['files'] = [shutil.copy(file, settings.DATA_DIR) for file in message['files']]
-    # add the input message to the history
-    history.extend([{'role': 'user', 'content': {'path': file}} for file in message['files']])
-    history.append({'role': 'user', 'content': message['text']})
-    yield history, ''
-    for step in bot.chat(message['text'], message['files']):
-        match step:
-            case ChatMessageToolCall():
-                if step.function.name == 'download_video':
-                    history.append({
-                        'role': 'assistant',
-                        'content': f'📥 Downloading video from {step.function.arguments["url"]}'
-                    })
-                elif step.function.name == 'add_video':
-                    history.append({
-                        'role': 'assistant',
-                        'content': f'🎥 Processing and adding video `{step.function.arguments["filename"]}` '
-                                   f'to the knowledge base. This may take a while...'
-                    })
-                elif step.function.name == 'search_in_video':
-                    filename = os.path.basename(bot.video_rag.videos[step.function.arguments["video_id"]]['video_path'])
-                    history.append({
-                        'role': 'assistant',
-                        'content': f'🔍 Searching in video `{filename}` '
-                                   f'for query: *{step.function.arguments.get("text_query", step.function.arguments.get("image_query", ""))}*'
-                    })
-                elif step.function.name == 'final_answer':
-                    continue
-                yield history, ''
-            case ActionStep():
-                yield history, ''
-            case FinalAnswerStep():
-                history.append({'role': 'assistant', 'content': step.output})
-                yield history, ''
-def clear_chat(chatbot):
-    chatbot.clear()
-    return chatbot, gr.update(value='')
-def main():
-    with gr.Blocks() as demo:
-        gr.Markdown('# Video Chatbot Demo')
-        gr.Markdown('This demo showcases a video chatbot that can process and search videos using '
-                    'RAG (Retrieval-Augmented Generation). You can upload videos/images or link to YouTube videos, '
-                    'ask questions, and get answers based on the video content.')
-        chatbot = gr.Chatbot(type='messages', label='Video Chatbot', height=800, resizable=True)
-        textbox = gr.MultimodalTextbox(
-            sources=['upload'],
-            file_types=['image', '.mp4'],
-            show_label=False,
-            placeholder='Type a message or upload an image/video...',
-        )
-        textbox.submit(chat, [textbox, chatbot], [chatbot, textbox])
-        clear = gr.Button('Clear Chat')
-        clear.click(clear_chat, [chatbot], [chatbot, textbox])
-    demo.launch(debug=True)
-if __name__ == '__main__':
     main()

+import os
+import shutil
+import gradio as gr
+from smolagents import ChatMessageToolCall, ActionStep, FinalAnswerStep
+import utils
+from agent import VideoChatbot
+from configs import settings
+bot = VideoChatbot(
+    model=settings.CHATBOT_MODEL,
+    api_base=settings.MODEL_BASE_API,
+    api_key=os.environ['GEMINI_API_KEY']
+)
+def chat(message: dict, history: list[dict]):
+    # move the file to the data directory
+    message['files'] = [shutil.copy(file, settings.DATA_DIR) for file in message['files']]
+    # add the input message to the history
+    history.extend([{'role': 'user', 'content': {'path': file}} for file in message['files']])
+    history.append({'role': 'user', 'content': message['text']})
+    yield history, ''
+    for step in bot.chat(message['text'], message['files']):
+        match step:
+            case ChatMessageToolCall():
+                if step.function.name == 'download_video':
+                    history.append({
+                        'role': 'assistant',
+                        'content': f'📥 Downloading video from {step.function.arguments["url"]}'
+                    })
+                elif step.function.name == 'index_video':
+                    video_path = os.path.join(settings.DATA_DIR, step.function.arguments['filename'])
+                    video_duration = utils.seconds_to_hms(int(utils.get_media_duration(video_path)))
+                    history.append({
+                        'role': 'assistant',
+                        'content': f'🎥 Indexing video `{step.function.arguments["filename"]}` with length *{video_duration}* '
+                                   f'to the knowledge base. This may take a while...'
+                    })
+                elif step.function.name == 'search_video_segments':
+                    filename = os.path.basename(bot.video_rag.videos[step.function.arguments["video_id"]]['video_path'])
+                    history.append({
+                        'role': 'assistant',
+                        'content': f'🔍 Searching video segments in `{filename}` '
+                                   f'for query: *{step.function.arguments.get("text_query", step.function.arguments.get("image_query", ""))}*'
+                    })
+                elif step.function.name == 'read_video_segment':
+                    filename = os.path.basename(bot.video_rag.videos[step.function.arguments["video_id"]]['video_path'])
+                    history.append({
+                        'role': 'assistant',
+                        'content': f'📖 Reading video segment `{filename}` '
+                                   f'from *{step.function.arguments["start_time"]}* to *{step.function.arguments["end_time"]}*'
+                    })
+                elif step.function.name == 'final_answer':
+                    continue
+                yield history, ''
+            case ActionStep():
+                yield history, ''
+            case FinalAnswerStep():
+                history.append({'role': 'assistant', 'content': step.output})
+                yield history, ''
def clear_chat(chatbot):
    """Empty the chat history in place and reset the input textbox.

    Args:
        chatbot: The current chat history; it is cleared with `.clear()`.

    Returns:
        tuple: The emptied history and a Gradio update that sets the textbox value to an empty string.
    """
    chatbot.clear()
    textbox_reset = gr.update(value='')
    return chatbot, textbox_reset
def main():
    """Assemble the Gradio Blocks interface for the video chatbot and launch it in debug mode."""
    intro = ('This demo showcases a video chatbot that can process and search videos using '
             'RAG (Retrieval-Augmented Generation). You can upload videos/images or link to YouTube videos, '
             'ask questions, and get answers based on the video content.')
    with gr.Blocks() as demo:
        # page header
        gr.Markdown('# Video Chatbot Demo')
        gr.Markdown(intro)
        # components are created in display order: chat view, input box, clear button
        chatbot = gr.Chatbot(type='messages', label='Video Chatbot', height=800, resizable=True)
        textbox = gr.MultimodalTextbox(
            sources=['upload'],
            file_types=['image', '.mp4'],
            show_label=False,
            placeholder='Type a message or upload an image/video...',
        )
        clear = gr.Button('Clear Chat')
        # both handlers write to the chat view and the input box
        textbox.submit(chat, [textbox, chatbot], [chatbot, textbox])
        clear.click(clear_chat, [chatbot], [chatbot, textbox])
    demo.launch(debug=True)
+if __name__ == '__main__':
     main()

prompt.py CHANGED Viewed

@@ -1,5 +1,7 @@
 import os
 def image_to_text_prompt(image_path: str, metadata: dict = None) -> str:
     """Generate a text prompt to represent a image file with its metadata."""
@@ -25,3 +27,38 @@ Filename: {os.path.basename(video_path)}
 Metadata: {metadata_lines}
 </video>
 '''

 import os
+import utils
 def image_to_text_prompt(image_path: str, metadata: dict = None) -> str:
     """Generate a text prompt to represent a image file with its metadata."""
 Metadata: {metadata_lines}
 </video>
 '''
def video_segment_to_text_prompt(
        start: float,
        end: float,
        transcript_segments: list[dict],
        frame_paths: list[str]
) -> str:
    """Render a video segment as a `<video_segment>` text block.

    Args:
        start (float): Segment start in seconds; truncated to whole seconds for display.
        end (float): Segment end in seconds; truncated to whole seconds for display.
        transcript_segments (list[dict]): Transcript entries, each read for its `start`, `end` and `text` keys.
        frame_paths (list[str]): Paths of the frame images, each emitted as an `<image>` tag.

    Returns:
        str: The block containing the timespan, one transcript line per entry, and one image tag per frame.
    """
    to_hms = utils.seconds_to_hms
    timespan = f'{to_hms(int(start))} - {to_hms(int(end))}'
    # every transcript entry starts on its own line, so an empty transcript adds nothing
    transcript = ''.join(
        f'\n- {to_hms(int(entry["start"]), drop_hours=True)}'
        f'-{to_hms(int(entry["end"]), drop_hours=True)}: {entry["text"]}'
        for entry in transcript_segments
    )
    frames = '\n'.join(f'<image>{path}</image>' for path in frame_paths)
    return (
        '<video_segment>\n'
        f'Timespan: {timespan}\n'
        f'Transcript: {transcript}\n'
        f'{frames}\n'
        '</video_segment>\n'
    )

rag.py CHANGED Viewed

@@ -76,19 +76,19 @@ class VideoRAG:
             raise ValueError(f'Video with ID {video_id} not found.')
         return self.videos[video_id]
-    def add_video(self, video_path: str) -> str:
-        """Add a video to the RAG system by processing its frames and transcripts.
         Args:
-            video_path (str): The path to the video file to be added.
         Returns:
-            str: A unique video ID generated for the added video.
         """
         # create a unique video ID
         video_id = uuid.uuid4().hex[:8]
-        print(f'Adding video "{video_path}" with ID {video_id} to the RAG system...')
         print('Extracting video frames')
         # process video frames
@@ -149,12 +149,13 @@ class VideoRAG:
         # add video metadata to the database
         self.videos[video_id] = {
             'video_path': video_path,
             'frame_dir': f'{video_path}_frames',
             'video_frame_rate': self.video_frame_rate,
             'transcript_segments': segments,
         }
-        print(f'Video "{video_path}" added with ID {video_id}.')
         return video_id
     def search(self, video_id: str, text: str = None, image: str | Image.Image = None, limit: int = 10) -> list[dict]:
@@ -274,6 +275,45 @@ class VideoRAG:
         return timespans
     def clear(self):
         """Clear the RAG system by dropping all tables and resetting video metadata."""
         self._init_db()
@@ -332,4 +372,4 @@ def merge_searched_timespans(timespans: list[dict], threshold: float) -> list[di
     # Add the last span
     merged_spans.append(current_span)
-    return merged_spans

             raise ValueError(f'Video with ID {video_id} not found.')
         return self.videos[video_id]
+    def index(self, video_path: str) -> str:
+        """Index a video file into the RAG system by extracting frames, transcribing audio, and computing embeddings.
         Args:
+            video_path (str): The path to the video file to be indexed.
         Returns:
+            str: A unique video ID generated for the indexed video.
         """
         # create a unique video ID
         video_id = uuid.uuid4().hex[:8]
+        print(f'Indexing video "{video_path}" with ID {video_id} to the RAG system...')
         print('Extracting video frames')
         # process video frames
         # add video metadata to the database
         self.videos[video_id] = {
             'video_path': video_path,
+            'video_duration': utils.get_media_duration(video_path),
             'frame_dir': f'{video_path}_frames',
             'video_frame_rate': self.video_frame_rate,
             'transcript_segments': segments,
         }
+        print(f'Video "{video_path}" indexed with ID {video_id}.')
         return video_id
     def search(self, video_id: str, text: str = None, image: str | Image.Image = None, limit: int = 10) -> list[dict]:
         return timespans
+    def read(self, video_id: str, start: float, end: float) -> dict:
+        """Read a segment of the video by its ID and time range.
+        Args:
+            video_id (str): The ID of the video to read.
+            start (float): The start time of the segment in seconds.
+            end (float): The end time of the segment in seconds.
+        Returns:
+            dict: A dictionary containing the video segment metadata, including start and end times, frame paths, and transcript segments.
+        """
+        video_metadata = self.get_video(video_id)
+        if start > video_metadata['video_duration'] or end > video_metadata['video_duration']:
+            raise ValueError(f'Start ({start}) or end ({end}) time exceeds video duration ({video_metadata["video_duration"]}).')
+        timespan = {
+            'start': start,
+            'end': end,
+            'frame_paths': [],
+            'transcript_segments': []
+        }
+        # add frame paths
+        for frame_index in range(
+                int(start * self.video_frame_rate),
+                int(end * self.video_frame_rate)
+        ):
+            timespan['frame_paths'].append(os.path.join(video_metadata['frame_dir'], f'{frame_index + 1}.jpg'))
+        # add transcript segments
+        for segment in video_metadata['transcript_segments']:
+            if utils.span_iou((segment['start'], segment['end']),
+                              (start, end)) > 0:
+                timespan['transcript_segments'].append(segment)
+        return timespan
     def clear(self):
         """Clear the RAG system by dropping all tables and resetting video metadata."""
         self._init_db()
     # Add the last span
     merged_spans.append(current_span)
+    return merged_spans

tools.py CHANGED Viewed

@@ -4,7 +4,7 @@ from smolagents import tool, Tool
 import utils
 from configs import settings
-from prompt import video_to_text_prompt
 from rag import VideoRAG
@@ -45,32 +45,32 @@ def download_video(url: str) -> str:
 def create_video_rag_tools(video_rag: VideoRAG) -> list[Tool]:
     @tool
-    def add_video(filename: str) -> str:
         """
-        Add a video file to the RAG knowledge-base for further search and analysis.
         Args:
-            filename (str): The video filename to add.
         Returns:
-            str: The video ID if added successfully, or an error message.
         """
         try:
-            video_id = video_rag.add_video(os.path.join(settings.DATA_DIR, filename))
-            return f'Video added with ID: {video_id}'
         except Exception as e:
-            return f'Error adding video: {e.__class__.__name__}: {e}'
     @tool
-    def search_in_video(video_id: str, text_query: str = None, image_query: str = None) -> str:
         """
-        Search for relevant video frames and transcripts based on text or image query. Allows searching within a specific video added to the RAG knowledge-base.
         At least one of `text_query` or `image_query` must be provided.
         The image frames of the retrieved video segments will be output at a frame rate of 1 frame per second. The order of the frames is according to the returned video segments.
         Args:
-            video_id (str): The ID of the video to search in. This should be the ID returned by `add_video`.
             text_query (str, optional): The text query to search for in the video transcripts.
             image_query (str, optional): The image query to search for in the video frames. This is the filename of the image.
@@ -79,7 +79,7 @@ def create_video_rag_tools(video_rag: VideoRAG) -> list[Tool]:
         """
         if not video_rag.is_video_exists(video_id):
-            return f'Video with ID "{video_id}" not found in the knowledge-base. Please add the video first using `add_video` tool.'
         if not text_query and not image_query:
             return 'Please provide at least one of `text_query` or `image_query` to search in the video.'
         if image_query:
@@ -101,32 +101,45 @@ def create_video_rag_tools(video_rag: VideoRAG) -> list[Tool]:
         # build the output message
         output = f'Search results for video ID {video_id}:\n'
         for result in results:
-            # include timespans
-            timespan_text = f'{utils.seconds_to_hms(int(result["start"]))} - {utils.seconds_to_hms(int(result["end"]))}'
-            # include transcript segments
-            transcript_texts = []
-            for segment in result['transcript_segments']:
-                transcript_texts.append(
-                    f'- {utils.seconds_to_hms(int(segment["start"]), drop_hours=True)}'
-                    f'-{utils.seconds_to_hms(int(segment["end"]), drop_hours=True)}: {segment["text"]}')
-            transcript_lines = '\n'.join(transcript_texts)
-            if transcript_lines:
-                transcript_lines = '\n' + transcript_lines
-            # include frame images
-            image_tags = []
-            for frame_path in result['frame_paths']:
-                image_tags.append(f'<image>{frame_path}</image>')
-            frame_images_lines = '\n'.join(image_tags)
-            output += f'''<video_segment>
-Timespan: {timespan_text}
-Transcript: {transcript_lines}
-{frame_images_lines}
-</video_segment>
-'''
         return output
-    return [add_video, search_in_video]

 import utils
 from configs import settings
+from prompt import video_to_text_prompt, video_segment_to_text_prompt
 from rag import VideoRAG
 def create_video_rag_tools(video_rag: VideoRAG) -> list[Tool]:
     @tool
+    def index_video(filename: str) -> str:
         """
+        Index a video file to the RAG knowledge-base for further search and analysis.
         Args:
+            filename (str): The video filename to index.
         Returns:
+            str: The video ID if indexed successfully, or an error message.
         """
         try:
+            video_id = video_rag.index(os.path.join(settings.DATA_DIR, filename))
+            return f'Video indexed with ID: {video_id}'
         except Exception as e:
+            return f'Error indexing video: {e.__class__.__name__}: {e}'
     @tool
+    def search_video_segments(video_id: str, text_query: str = None, image_query: str = None) -> str:
         """
+        Search for relevant video frames and transcripts based on text or image query. Allows searching within a specific video indexed to the RAG knowledge-base.
         At least one of `text_query` or `image_query` must be provided.
         The image frames of the retrieved video segments will be output at a frame rate of 1 frame per second. The order of the frames is according to the returned video segments.
         Args:
+            video_id (str): The ID of the video to search in. This should be the ID returned by `index_video`.
             text_query (str, optional): The text query to search for in the video transcripts.
             image_query (str, optional): The image query to search for in the video frames. This is the filename of the image.
         """
         if not video_rag.is_video_exists(video_id):
+            return f'Video with ID "{video_id}" not found in the knowledge-base. Please index the video first using `index_video` tool.'
         if not text_query and not image_query:
             return 'Please provide at least one of `text_query` or `image_query` to search in the video.'
         if image_query:
         # build the output message
         output = f'Search results for video ID {video_id}:\n'
         for result in results:
+            output += video_segment_to_text_prompt(
+                start=result['start'],
+                end=result['end'],
+                transcript_segments=result['transcript_segments'],
+                frame_paths=result['frame_paths']
+            )
         return output
+    def read_video_segment(video_id: str, start: str, end: str) -> str:
+        """
+        Read a specific segment of a video by its ID and time range. Use this tool when you want to read a specific segment of a video for further analysis. Don't use this tool to search for video segments, use `search_video_segments` instead. Don't read too long segments.
+        Args:
+            video_id (str): The ID of the video to read.
+            start (str): The start time in HH:MM:SS or MM:SS format. (e.g., "00:01:30" or "01:30" for 1 minute 30 seconds)
+            end (str): The end time in HH:MM:SS or MM:SS format. (e.g., "00:02:00" or "02:00" for 2 minutes)
+        Returns:
+            str: A message indicating the segment has been read or an error message if the video is not found. The output will include the video segment's timespan and the path to the video segment file.
+        """
+        if not video_rag.is_video_exists(video_id):
+            return f'Video with ID "{video_id}" not found in the knowledge-base. Please index the video first using `index_video` tool.'
+        # convert start and end to seconds
+        start_seconds = utils.hms_to_seconds(start)
+        end_seconds = utils.hms_to_seconds(end)
+        try:
+            result = video_rag.read(video_id, start_seconds, end_seconds)
+        except Exception as e:
+            return f'Error reading video segment: {e.__class__.__name__}: {e}'
+        return f'''Read video segment of video ID {video_id}:
+{video_segment_to_text_prompt(
+    start=start_seconds,
+    end=end_seconds,
+    transcript_segments=result['transcript_segments'],
+    frame_paths=result['frame_paths']
+)}'''
+    return [index_video, search_video_segments, read_video_segment]

transcriber.py CHANGED Viewed

@@ -4,6 +4,9 @@ from typing import Any
 from google import genai
 from google.genai import types
 class AudioTranscriber:
     """A class to transcribe audio files"""
@@ -33,14 +36,14 @@ Your response MUST be a valid JSON object with the following structure:
 - A "segment" is defined as a continuous section of speech from a single speaker include multiple sentences or phrases.
 - Each segment object MUST contain `text`, `start`, `end`, and `speaker` fields.
 - `text`: The verbatim transcription of the speech within that segment.
-- `start`: The precise start time of the segment in seconds, represented as a floating-point number (e.g., 0.0, 5.25).
-- `end`: The precise end time of the segment in seconds, represented as a floating-point number (e.g., 4.9, 10.12).
 - `speaker`: An integer representing the speaker ID.
   + Speaker IDs start at `0` for the first detected speaker.
   + The speaker ID MUST increment by 1 each time a new, distinct speaker is identified in the audio. Do not reuse speaker IDs within the same transcription.
   + If the same speaker talks again after another speaker, they retain their original speaker ID.
   + **Segment Splitting Rule**: A segment for the same speaker should only be split if there is a period of silence lasting more than 5 seconds. Otherwise, continuous speech from the same speaker, even with short pauses, should remain within a single segment.
 2. Language:
 - `language`: A two-letter ISO 639-1 code representing the primary language of the transcribed text (e.g., "en" for English, "es" for Spanish, "fr" for French).
 -  If multiple languages are detected in the audio, you MUST select and output only the ISO 639-1 code for the primary language used throughout the audio.
@@ -60,11 +63,11 @@ Your response MUST be a valid JSON object with the following structure:
                             'description': 'The transcribed text for the segment.'
                         },
                         'start': {
-                            'type': 'number',
                             'description': 'The start time of the segment in seconds.'
                         },
                         'end': {
-                            'type': 'number',
                             'description': 'The end time of the segment in seconds.'
                         },
                         'speaker': {
@@ -117,9 +120,11 @@ Your response MUST be a valid JSON object with the following structure:
             if uploaded_file.state == 'FAILED':
                 raise ValueError('Failed to upload the audio file')
         response = self.client.models.generate_content(
             model=self.model,
-            contents=uploaded_file,
             config=types.GenerateContentConfig(
                 system_instruction=self.SYSTEM_INSTRUCTION,
                 temperature=0.2,
@@ -131,4 +136,4 @@ Your response MUST be a valid JSON object with the following structure:
         if response.parsed is None:
             raise ValueError('Failed to transcribe the audio file')
-        return response.parsed  # type: ignore

 from google import genai
 from google.genai import types
+import utils
 class AudioTranscriber:
     """A class to transcribe audio files"""
 - A "segment" is defined as a continuous section of speech from a single speaker include multiple sentences or phrases.
 - Each segment object MUST contain `text`, `start`, `end`, and `speaker` fields.
 - `text`: The verbatim transcription of the speech within that segment.
+- `start`: The precise start time of the segment in seconds, represented as a integer number (e.g., 1, 5)
+- `end`: The precise end time of the segment in seconds, represented as a integer number (e.g., 2, 6)
 - `speaker`: An integer representing the speaker ID.
   + Speaker IDs start at `0` for the first detected speaker.
   + The speaker ID MUST increment by 1 each time a new, distinct speaker is identified in the audio. Do not reuse speaker IDs within the same transcription.
   + If the same speaker talks again after another speaker, they retain their original speaker ID.
   + **Segment Splitting Rule**: A segment for the same speaker should only be split if there is a period of silence lasting more than 5 seconds. Otherwise, continuous speech from the same speaker, even with short pauses, should remain within a single segment.
 2. Language:
 - `language`: A two-letter ISO 639-1 code representing the primary language of the transcribed text (e.g., "en" for English, "es" for Spanish, "fr" for French).
 -  If multiple languages are detected in the audio, you MUST select and output only the ISO 639-1 code for the primary language used throughout the audio.
                             'description': 'The transcribed text for the segment.'
                         },
                         'start': {
+                            'type': 'integer',
                             'description': 'The start time of the segment in seconds.'
                         },
                         'end': {
+                            'type': 'integer',
                             'description': 'The end time of the segment in seconds.'
                         },
                         'speaker': {
             if uploaded_file.state == 'FAILED':
                 raise ValueError('Failed to upload the audio file')
+        audio_duration = utils.get_media_duration(audio_path)
         response = self.client.models.generate_content(
             model=self.model,
+            contents=[uploaded_file, f'Audio duration: {int(audio_duration)} seconds'],
             config=types.GenerateContentConfig(
                 system_instruction=self.SYSTEM_INSTRUCTION,
                 temperature=0.2,
         if response.parsed is None:
             raise ValueError('Failed to transcribe the audio file')
+        return response.parsed  # type: ignore

utils.py CHANGED Viewed

@@ -3,6 +3,7 @@ import os.path
 import subprocess
 from yt_dlp import YoutubeDL
 from configs import settings
@@ -149,6 +150,16 @@ def split_media_file(file_path: str, output_dir: str, segment_length: int = 60)
     return sorted(glob.glob(f'{output_dir}/*{base_name}_*.{extension}'))
 def span_iou(span1: tuple[float, float], span2: tuple[float, float]) -> float:
     """Calculate the Intersection over Union (IoU) of two spans."""
     start1, end1 = span1
@@ -179,3 +190,16 @@ def seconds_to_hms(total_seconds: int, drop_hours: bool = False) -> str:
         return f'{minutes:02d}:{seconds:02d}'
     return f'{hours:02d}:{minutes:02d}:{seconds:02d}'

 import subprocess
 from yt_dlp import YoutubeDL
+from pymediainfo import MediaInfo
 from configs import settings
     return sorted(glob.glob(f'{output_dir}/*{base_name}_*.{extension}'))
def get_media_duration(file_path: str) -> float:
    """Get the duration of a media file in seconds.

    Args:
        file_path (str): The path to the media file.

    Returns:
        float: The duration in seconds, taken from the file's General track.

    Raises:
        ValueError: If no General track reports a usable duration.
    """
    # use pymediainfo to get the duration
    media_info = MediaInfo.parse(file_path)
    for track in media_info.tracks:
        if track.track_type != 'General' or track.duration is None:
            # a track without a duration would otherwise fail on `None / 1000.0`
            continue
        try:
            # the reported value is divided by 1000 to convert it to seconds
            return float(track.duration) / 1000.0
        except (TypeError, ValueError):
            # non-numeric duration: fall through to the error below
            break
    raise ValueError(f'Could not determine duration for file: {file_path}')
 def span_iou(span1: tuple[float, float], span2: tuple[float, float]) -> float:
     """Calculate the Intersection over Union (IoU) of two spans."""
     start1, end1 = span1
         return f'{minutes:02d}:{seconds:02d}'
     return f'{hours:02d}:{minutes:02d}:{seconds:02d}'
def hms_to_seconds(hms: str) -> int:
    """Convert a timestamp string to total seconds.

    Args:
        hms (str): A timestamp in HH:MM:SS, MM:SS or plain SS form. Whitespace around each field is ignored.

    Returns:
        int: The total number of seconds.

    Raises:
        ValueError: If the value is not a string, has more than three fields, or any field is not a non-negative integer.
    """
    if not isinstance(hms, str):
        raise ValueError('Invalid time format. Use HH:MM:SS or MM:SS.')
    parts = [part.strip() for part in hms.split(':')]
    # isdecimal() rejects empty fields, signs and other non-digit input
    if not 1 <= len(parts) <= 3 or not all(part.isdecimal() for part in parts):
        raise ValueError('Invalid time format. Use HH:MM:SS or MM:SS.')
    total_seconds = 0
    # each field to the right is worth 1/60 of the one before it
    for part in parts:
        total_seconds = total_seconds * 60 + int(part)
    return total_seconds