AI Video Editing: Auto-Cut with YOLO Detection + Whisper Captions
Back to Blog
Video · 10 min read

AI Video Editing: Auto-Cut with YOLO Detection + Whisper Captions

Turn 30 minutes of raw footage into a tight edit automatically using YOLO object detection for action moments and Whisper for word-level captions. Full Python pipeline included.

NA
By NEPA AI
NEPA AI · Building autonomous systems for creators and businesses
#video editing#YOLO#Whisper#auto-edit#python#ffmpeg#computer vision

Manual video editing is slow and tedious. This post shows how to automate it in Python: YOLOv8 finds the action moments, Whisper transcribes speech with word-level timestamps, and ffmpeg does the cutting — all stitched together into one pipeline for fast edits.

The Core Idea

Instead of scrubbing, the pipeline:

  1. Scans sampled frames (every Nth frame) using YOLO
  2. Scores each second based on detection confidence and object count
  3. Transcribes audio with Whisper, getting word-level timestamps
  4. Identifies highlight windows using a sliding score algorithm
  5. Cuts and exports segments with captions burned in

No manual tagging. No timeline scrubbing.

Setup

pip install ultralytics openai-whisper ffmpeg-python numpy pandas
# The ffmpeg binary is also required
sudo apt install ffmpeg  # Ubuntu/Debian

Step 1: YOLO Frame Scoring

import cv2
import numpy as np
from ultralytics import YOLO
from pathlib import Path

def score_video_frames(
    video_path: str,
    model_size: str = "yolov8m.pt",  # n, s, m, l, x
    target_classes: list = None,      # None = all classes
    sample_rate: int = 5,             # Check every Nth frame
    confidence_threshold: float = 0.4
) -> dict:
    """Score sampled frames of a video by summed YOLO detection confidence.

    Every ``sample_rate``-th frame is passed to the YOLO model. Each detection
    that survives the class filter adds ``confidence * 10`` to that frame's
    score, so frames with more (and more confident) detections score higher.

    Args:
        video_path: Path of the video to analyse.
        model_size: Weights file handed to ``YOLO(...)``.
        target_classes: Class names to count. ``None`` or an empty list
            counts every detected class.
        sample_rate: Run detection on every Nth frame; must be >= 1.
        confidence_threshold: Passed to the model as ``conf``.

    Returns:
        Dict mapping timestamp in seconds (``frame_index / fps``) to score.
        Empty when the video cannot be opened or yields no frames.

    Raises:
        ValueError: If ``sample_rate`` is below 1, or if the opened video
            reports a non-positive frame rate (timestamps could not be
            computed).
    """
    if sample_rate < 1:
        raise ValueError(f"sample_rate must be >= 1, got {sample_rate}")

    model = YOLO(model_size)
    cap = cv2.VideoCapture(video_path)
    frame_scores = {}

    # try/finally guarantees the capture handle is released even when
    # inference raises part-way through the video.
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        if cap.isOpened() and fps <= 0:
            # Without a frame rate, frame indices cannot become timestamps.
            raise ValueError(f"Could not determine frame rate of {video_path!r}")

        frame_num = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_num % sample_rate == 0:
                results = model(frame, verbose=False, conf=confidence_threshold)

                score = 0
                for r in results:
                    boxes = r.boxes
                    if boxes is None:
                        continue
                    for box in boxes:
                        cls_name = model.names[int(box.cls[0])]

                        # Filter by target classes if specified
                        if target_classes and cls_name not in target_classes:
                            continue

                        # Each kept detection contributes its confidence, so
                        # several objects in one frame accumulate.
                        score += float(box.conf[0]) * 10

                frame_scores[frame_num / fps] = score

            frame_num += 1
    finally:
        cap.release()

    return frame_scores

# Example: prioritize person, bicycle detection
scores = score_video_frames(
    "raw_footage.mp4",
    model_size="yolov8m.pt",
    target_classes=["person", "bicycle"],
    sample_rate=3,
    confidence_threshold=0.35
)

print(f"Scored {len(scores)} timestamps")
# max() raises ValueError on an empty dict (e.g. when no frame could be read),
# so only report a peak when at least one timestamp was scored.
if scores:
    peak_time = max(scores, key=scores.get)
    print(f"Peak action at: {peak_time:.1f}s (score: {scores[peak_time]:.1f})")

Step 2: Whisper Transcription with Word Timestamps

import whisper
import json

def transcribe_with_timestamps(
    video_path: str,
    model_size: str = "small"  # tiny, base, small, medium, large
) -> dict:
    """Transcribe a media file with Whisper and flatten its per-word timings.

    Args:
        video_path: Path handed to Whisper's ``transcribe``.
        model_size: Name of the Whisper model to load.

    Returns:
        Dict with the keys ``text``, ``language``, ``words`` and ``segments``.
        ``words`` is a flat list of dicts (``word``, ``start``, ``end``,
        ``probability``) gathered from every segment that carries a
        ``words`` entry; ``probability`` falls back to 1.0 when absent.
    """
    asr = whisper.load_model(model_size)

    # word_timestamps=True is what makes Whisper attach per-word timing
    output = asr.transcribe(video_path, word_timestamps=True)

    flat_words = [
        {
            "word": entry["word"].strip(),
            "start": entry["start"],
            "end": entry["end"],
            "probability": entry.get("probability", 1.0),
        }
        for seg in output["segments"]
        if "words" in seg
        for entry in seg["words"]
    ]

    return {
        "text": output["text"],
        "language": output["language"],
        "words": flat_words,
        "segments": output["segments"],
    }

# Transcribe the raw footage, then print a short summary of the result
transcript = transcribe_with_timestamps(video_path="raw_footage.mp4")

print(f"Language detected: {transcript['language']}")
print(f"Total words: {len(transcript['words'])}")
print(f"First 5 words: {[w['word'] for w in transcript['words'][0:5]]}")

Step 3: Find Highlight Windows

from collections import defaultdict
import pandas as pd

def find_highlight_windows(
    frame_scores: dict,
    transcript_words: list,
    min_duration: float = 3.0,    # Minimum clip length (seconds)
    max_duration: float = 15.0,   # Maximum clip length
    top_n: int = 10,              # Number of highlights to extract
    score_window: float = 2.0,    # Seconds to average scores over
    silence_penalty: float = 0.3  # Reduce score during silence
) -> list:
    """Pick the highest-scoring, non-overlapping clip windows.

    Frame scores and word probabilities are bucketed into one-second bins.
    Each second ``t`` is ranked by the summed frame score of bins
    ``t-1 .. t+1`` plus twice the summed word probability of bin ``t``.
    Candidates are visited best-first; each opens a window starting one
    second before ``t`` and lasting at most ``max_duration`` seconds. When
    words fall completely inside the window, it is tightened to the first
    and last of them (with 0.3 s of padding, never growing past the window).

    Args:
        frame_scores: Mapping of timestamp (seconds) to visual score.
        transcript_words: Word dicts with ``word``, ``start``, ``end`` and
            ``probability`` keys. The first and last matching entries are
            used as the speech bounds, so the list is assumed to be in
            chronological order.
        min_duration: Shortest clip to accept, in seconds.
        max_duration: Longest clip to accept, in seconds.
        top_n: Maximum number of highlights returned.
        score_window: Accepted for interface compatibility; not applied
            (the scoring window is fixed at three one-second bins).
        silence_penalty: Accepted for interface compatibility; not applied.

    Returns:
        List of dicts (``start``, ``end``, ``duration``, ``score``,
        ``words``) sorted by ``start``. Empty when ``frame_scores`` is empty.
    """
    # Nothing was scored, so there is nothing to rank (max() would raise).
    if not frame_scores:
        return []

    # Computed once; clip ends are clamped to the last scored whole second.
    video_end = int(max(frame_scores))

    timeline = defaultdict(float)
    for t, score in frame_scores.items():
        timeline[int(t)] += score

    speech_bins = defaultdict(float)
    for word in transcript_words:
        speech_bins[int(word["start"])] += word["probability"]

    combined_scores = {}
    for t in range(video_end):
        window_score = sum(timeline.get(t + i, 0) for i in range(-1, 2))
        speech_bonus = speech_bins.get(t, 0) * 2.0
        combined_scores[t] = window_score + speech_bonus

    highlights = []
    used_ranges = []
    sorted_times = sorted(combined_scores, key=combined_scores.get, reverse=True)

    for t in sorted_times:
        if len(highlights) >= top_n:
            break

        window_start = max(0, t - 1)
        # Measure the window from its own start. Measuring from t made it
        # max_duration + 1 seconds long, so it failed the duration check
        # below unless speech snapping happened to shrink it.
        window_end = min(video_end, window_start + max_duration)

        overlaps = any(window_start < e and window_end > s for s, e in used_ranges)
        if overlaps:
            continue

        window_words = [w for w in transcript_words
                        if w["start"] >= window_start and w["end"] <= window_end]

        start, end = window_start, window_end
        if window_words:
            # Tighten to the spoken part, but never grow beyond the window
            # that was just overlap-checked and length-limited.
            start = max(window_start, window_words[0]["start"] - 0.3)
            end = min(window_end, window_words[-1]["end"] + 0.3)

        duration = end - start
        if min_duration <= duration <= max_duration:
            highlights.append({
                "start": round(start, 2),
                "end": round(end, 2),
                "duration": round(duration, 2),
                "score": combined_scores[t],
                "words": [w["word"] for w in window_words]
            })
            used_ranges.append((start, end))

    highlights.sort(key=lambda x: x["start"])
    return highlights

# Ask for up to eight highlights, each between 4 and 12 seconds long
highlights = find_highlight_windows(
    frame_scores=scores,
    transcript_words=transcript["words"],
    top_n=8,
    min_duration=4.0,
    max_duration=12.0,
)

# One summary line per highlight: time range, score and its first six words
for h in highlights:
    print(f"  [{h['start']:.1f}s → {h['end']:.1f}s] score={h['score']:.1f} | {' '.join(h['words'][0:6])}...")

Step 4: Export Clips with Burned-In Captions

import ffmpeg
import os

def to_srt_time(seconds: float) -> str:
    """Format an offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    Negative offsets are clamped to zero.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, millis = divmod(remainder, 1_000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{millis:03d}"


def export_highlight_clips(
    source_video: str,
    highlights: list,
    transcript_words: list,
    output_dir: str = "./highlights",
    burn_captions: bool = True,
    caption_style: str = "fontsize=36:fontcolor=white:borderw=2:bordercolor=black"
) -> list:
    """Cut each highlight out of the source video, optionally burning captions.

    For every highlight, an H.264/AAC clip is written to ``output_dir``.
    With ``burn_captions`` enabled, the words inside the clip are grouped
    four at a time into an SRT file (timed relative to the clip start) and
    rendered with ffmpeg's ``subtitles`` filter.

    Args:
        source_video: Path of the video to cut from.
        highlights: Dicts with at least ``start`` and ``end`` (seconds).
        transcript_words: Word dicts with ``word``, ``start`` and ``end``.
        output_dir: Directory for clips and SRT files; created if missing.
        burn_captions: Whether to render captions into the video.
        caption_style: Value interpolated into the filter's ``force_style``.

    Returns:
        Paths of the exported clips, in the order of ``highlights``.
    """
    # NOTE(review): ffmpeg documents force_style as comma-separated ASS
    # KEY=VALUE pairs (e.g. "FontSize=36,Outline=2"); the default above uses
    # colon-separated drawtext-like keys — confirm the styling is applied.
    os.makedirs(output_dir, exist_ok=True)
    exported = []
    words_per_caption = 4

    for i, clip in enumerate(highlights):
        output_path = f"{output_dir}/highlight_{i+1:02d}_{clip['start']:.0f}s.mp4"
        encode_args = {"acodec": "aac", "vcodec": "libx264", "crf": 18}

        if burn_captions:
            encode_args["preset"] = "fast"

            # Words fully inside the clip (allowing 0.5 s of overhang at the end)
            clip_words = [w for w in transcript_words
                          if w["start"] >= clip["start"] and w["end"] <= clip["end"] + 0.5]

            # A clip without speech gets no subtitle filter instead of an
            # empty SRT file being handed to ffmpeg.
            if clip_words:
                srt_path = f"{output_dir}/clip_{i+1}.srt"
                # Explicit UTF-8 so non-ASCII words do not depend on the
                # platform's default encoding.
                with open(srt_path, "w", encoding="utf-8") as f:
                    for j in range(0, len(clip_words), words_per_caption):
                        chunk = clip_words[j:j + words_per_caption]
                        # Caption times are offsets from the clip start
                        start_ts = chunk[0]["start"] - clip["start"]
                        end_ts = chunk[-1]["end"] - clip["start"]
                        text = " ".join(w["word"] for w in chunk)

                        f.write(f"{j // words_per_caption + 1}\n")
                        f.write(f"{to_srt_time(max(0, start_ts))} --> {to_srt_time(end_ts)}\n")
                        f.write(f"{text}\n\n")

                encode_args["vf"] = f"subtitles={srt_path}:force_style='{caption_style}'"

        (
            ffmpeg
            .input(source_video, ss=clip["start"], to=clip["end"])
            .output(output_path, **encode_args)
            .overwrite_output()
            .run(quiet=True)
        )

        print(f"✓ Exported: {output_path}")
        exported.append(output_path)

    return exported

# Render every highlight into ./tiktok_clips with captions burned in
clips = export_highlight_clips(
    "raw_footage.mp4",
    highlights,
    transcript["words"],
    output_dir="./tiktok_clips",
    burn_captions=True,
)

print(f"\nExported {len(clips)} clips from {len(highlights)} highlights")

Full Pipeline in One Call

def run_auto_edit_pipeline(
    video_path: str,
    output_dir: str = "./auto_edited",
    yolo_model: str = "yolov8m.pt",
    whisper_model: str = "small",
    target_classes: list = None,
    n_highlights: int = 8
):
    """Run scoring, transcription, highlight selection and export in order.

    Args:
        video_path: Video to process.
        output_dir: Directory the exported clips are written to.
        yolo_model: Weights file forwarded to ``score_video_frames``.
        whisper_model: Model name forwarded to ``transcribe_with_timestamps``.
        target_classes: Class filter forwarded to ``score_video_frames``.
        n_highlights: Maximum number of highlights to select.

    Returns:
        Dict with the keys ``highlights``, ``clips`` and ``transcript``.
    """
    print("Step 1/4: Scoring frames with YOLO...")
    frame_scores = score_video_frames(
        video_path,
        model_size=yolo_model,
        target_classes=target_classes,
    )

    print("Step 2/4: Transcribing with Whisper...")
    transcript = transcribe_with_timestamps(video_path, model_size=whisper_model)
    spoken_words = transcript["words"]

    print("Step 3/4: Finding highlights...")
    highlights = find_highlight_windows(
        frame_scores,
        spoken_words,
        top_n=n_highlights,
    )
    print(f"  Found {len(highlights)} highlight windows")

    print("Step 4/4: Exporting clips...")
    exported_paths = export_highlight_clips(
        video_path,
        highlights,
        spoken_words,
        output_dir,
    )

    return {
        "highlights": highlights,
        "clips": exported_paths,
        "transcript": transcript,
    }

# Run the whole pipeline: keep the six best person/bicycle highlights
result = run_auto_edit_pipeline(
    "30min_raw_footage.mp4",
    output_dir="./tiktok_clips",
    n_highlights=6,
    target_classes=["person", "bicycle"],
)

The Video Workspace productizes this entire pipeline with 97 lines of config and 30+ editing methods. 30 minutes of footage → 3 platform-ready clips in 3 minutes.

→ Get Video Workspace on the Shop