AI Video Editing: Auto-Cut with YOLO Detection + Whisper Captions
Turn 30 minutes of raw footage into a tight edit automatically using YOLO object detection for action moments and Whisper for word-level captions. Full Python pipeline included.
Manual video editing is the bottleneck that kills content pipelines. This post walks through a real automated editing system: YOLO v8 detects action moments, Whisper transcribes speech at the word level, and ffmpeg does the cutting — all stitched together in a Python pipeline that can process 30 minutes of footage in under 5 minutes.
The Core Idea
Instead of scrubbing through footage, the pipeline:
- Scans sampled frames (every Nth frame, configurable) for interesting action using YOLO object detection
- Scores each second of video based on detection confidence and object count
- Transcribes the audio with Whisper, getting word-level timestamps
- Identifies highlight windows using a sliding score algorithm
- Cuts and exports segments using ffmpeg, with captions burned in
No manual tagging. No timeline scrubbing. Just input a raw video file and get a tight edit out.
Setup
pip install ultralytics openai-whisper ffmpeg-python numpy pandas
# Also need ffmpeg binary
sudo apt install ffmpeg # Ubuntu/Debian
Step 1: YOLO Frame Scoring
import cv2
import numpy as np
from ultralytics import YOLO
from pathlib import Path
def score_video_frames(
    video_path: str,
    model_size: str = "yolov8m.pt",  # n, s, m, l, x
    target_classes: list = None,  # None = all classes
    sample_rate: int = 5,  # Check every Nth frame
    confidence_threshold: float = 0.4
) -> dict:
    """
    Score each second of video based on YOLO detections.

    Every `sample_rate`-th frame is run through the detector; a frame's
    score is the sum of detection confidences (x10), so frames with many
    confident detections score highest.

    Args:
        video_path: Path to the input video file.
        model_size: Ultralytics YOLO weights file to load.
        target_classes: Class names to keep; None keeps every class.
        sample_rate: Only every Nth frame is scored (speed/accuracy trade-off).
        confidence_threshold: Minimum confidence passed to the detector.

    Returns:
        {timestamp_seconds: score} for each sampled frame.

    Raises:
        ValueError: If the video cannot be opened or reports a non-positive
            FPS (the original code would divide by zero later).
    """
    model = YOLO(model_size)
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Could not open video: {video_path}")
    fps = cap.get(cv2.CAP_PROP_FPS)
    if not fps or fps <= 0:
        cap.release()
        raise ValueError(f"Invalid FPS ({fps}) reported for: {video_path}")
    frame_scores = {}
    frame_num = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_num % sample_rate == 0:
                results = model(frame, verbose=False, conf=confidence_threshold)
                score = 0.0
                for r in results:
                    boxes = r.boxes
                    if boxes is None:
                        continue
                    for box in boxes:
                        cls_name = model.names[int(box.cls[0])]
                        # Filter by target classes if specified
                        if target_classes and cls_name not in target_classes:
                            continue
                        # Summing confidences rewards both confident
                        # detections and multiple objects in one frame.
                        score += float(box.conf[0]) * 10
                frame_scores[frame_num / fps] = score
            frame_num += 1
    finally:
        # Release the capture even if the model raises mid-scan.
        cap.release()
    return frame_scores
# Example: score a BMX riding video, prioritize person + bicycle detection
# sample_rate=3 scores every 3rd frame (finer than the default of 5, slower).
scores = score_video_frames(
"raw_footage.mp4",
model_size="yolov8m.pt",
target_classes=["person", "bicycle", "sports ball"],
sample_rate=3,
confidence_threshold=0.35
)
print(f"Scored {len(scores)} timestamps")
# max(scores, key=scores.get) is the timestamp whose action score is highest.
print(f"Peak action at: {max(scores, key=scores.get):.1f}s (score: {max(scores.values()):.1f})")
Step 2: Whisper Transcription with Word Timestamps
import whisper
import json
def transcribe_with_timestamps(
    video_path: str,
    model_size: str = "base",  # tiny, base, small, medium, large
    language: str = None  # None = auto-detect
) -> dict:
    """
    Transcribe a video's audio with Whisper and flatten per-word timings.

    Returns a dict with the full text, the detected language, the raw
    Whisper segments, and a flat `words` list of
    {"word", "start", "end", "probability"} entries.
    """
    model = whisper.load_model(model_size)
    result = model.transcribe(
        video_path,
        language=language,
        word_timestamps=True,  # Critical: enables per-word timing
        verbose=False
    )
    # Flatten the per-segment word lists into one chronological list.
    words = [
        {
            "word": entry["word"].strip(),
            "start": entry["start"],
            "end": entry["end"],
            "probability": entry.get("probability", 1.0),
        }
        for segment in result["segments"]
        for entry in segment.get("words", [])
    ]
    return {
        "text": result["text"],
        "language": result["language"],
        "words": words,
        "segments": result["segments"],
    }
# Transcribe the same footage; "small" is one step up from the "base" default.
transcript = transcribe_with_timestamps("raw_footage.mp4", model_size="small")
print(f"Language detected: {transcript['language']}")
print(f"Total words: {len(transcript['words'])}")
print(f"First 5 words: {[w['word'] for w in transcript['words'][:5]]}")
Step 3: Find Highlight Windows
from collections import defaultdict
import pandas as pd
def find_highlight_windows(
    frame_scores: dict,
    transcript_words: list,
    min_duration: float = 3.0,  # Minimum clip length (seconds)
    max_duration: float = 15.0,  # Maximum clip length
    top_n: int = 10,  # Number of highlights to extract
    score_window: float = 2.0,  # Seconds to average scores over
    silence_penalty: float = 0.3  # Reduce score during silence
) -> list:
    """
    Find the best N highlight windows by combining visual + speech scores.

    Frame scores are pooled into 1-second bins, smoothed over a
    ~`score_window`-second sliding window, boosted where speech occurs,
    and damped by `silence_penalty` in bins with no speech. The top-scoring
    non-overlapping windows are then snapped to the speech inside them.

    Args:
        frame_scores: {timestamp_seconds: score} from score_video_frames().
        transcript_words: Word dicts with "start", "end", "probability".
        min_duration: Candidate windows shorter than this are discarded.
        max_duration: Candidate windows longer than this are discarded.
        top_n: Maximum number of highlights returned.
        score_window: Width in seconds of the smoothing window (the default
            2.0 gives a +/-1 s window, matching the previous hard-coded one).
        silence_penalty: Fraction (0-1) by which a silent bin's visual score
            is reduced. Fix: this parameter (and score_window) was accepted
            but never applied in the previous version.

    Returns:
        List of {"start", "end", "duration", "score", "words"} dicts,
        sorted by start time. Empty list for empty frame_scores.
    """
    all_times = sorted(frame_scores.keys())
    if not all_times:
        return []
    max_time = max(all_times)
    # Pool raw per-frame scores into 1-second bins.
    timeline = defaultdict(float)
    for t, score in frame_scores.items():
        timeline[int(t)] += score
    # Speech density: sum of word probabilities per 1-second bin.
    speech_bins = defaultdict(float)
    for word in transcript_words:
        speech_bins[int(word["start"])] += word["probability"]
    # Smooth visual scores over ~score_window seconds.
    half_window = max(1, int(score_window // 2))
    combined_scores = {}
    for t in range(int(max_time)):
        window_score = sum(timeline.get(t + i, 0)
                           for i in range(-half_window, half_window + 1))
        if speech_bins.get(t, 0) == 0:
            # Silent bin: damp the purely visual score as documented.
            window_score *= (1.0 - silence_penalty)
        speech_bonus = speech_bins.get(t, 0) * 2.0
        combined_scores[t] = window_score + speech_bonus
    # Greedy non-maximum suppression: take peaks best-first, skipping any
    # candidate that overlaps an already-accepted highlight.
    highlights = []
    used_ranges = []
    sorted_times = sorted(combined_scores, key=combined_scores.get, reverse=True)
    for t in sorted_times:
        if len(highlights) >= top_n:
            break
        start = max(0, t - 1)
        end = min(max_time, t + max_duration)
        overlaps = any(not (end <= s or start >= e) for s, e in used_ranges)
        if overlaps:
            continue
        # Snap the window to the speech it contains (plus 0.3 s padding)
        # so captions are never cut mid-word.
        window_words = [w for w in transcript_words
                        if w["start"] >= start and w["end"] <= end]
        if window_words:
            start = max(0, window_words[0]["start"] - 0.3)
            end = min(max_time, window_words[-1]["end"] + 0.3)
        duration = end - start
        if min_duration <= duration <= max_duration:
            highlights.append({
                "start": round(start, 2),
                "end": round(end, 2),
                "duration": round(duration, 2),
                "score": combined_scores[t],
                "words": [w["word"] for w in window_words]
            })
            used_ranges.append((start, end))
    # Return in timeline order for sequential export.
    highlights.sort(key=lambda x: x["start"])
    return highlights
# Pick the top 8 non-overlapping highlights between 4 and 12 seconds long.
highlights = find_highlight_windows(
scores,
transcript["words"],
min_duration=4.0,
max_duration=12.0,
top_n=8
)
# Preview each highlight: time range, score, and first few spoken words.
for h in highlights:
print(f" [{h['start']:.1f}s → {h['end']:.1f}s] score={h['score']:.1f} | {' '.join(h['words'][:6])}...")
Step 4: Export Clips with Burned-In Captions
import ffmpeg
import os
def _to_srt_time(t: float) -> str:
    """Format seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    h = int(t // 3600)
    m = int((t % 3600) // 60)
    s = int(t % 60)
    ms = int((t % 1) * 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def _write_clip_srt(srt_path: str, clip: dict, clip_words: list, chunk_size: int = 4) -> None:
    """
    Write an SRT file for one clip, grouping words into ~chunk_size-word cues.

    Timestamps are rebased so 0 = the clip's start and clamped to
    [0, clip length], so no cue starts before the clip or outlives it
    (the previous version only clamped the start).
    """
    clip_len = clip["end"] - clip["start"]
    with open(srt_path, "w") as f:
        for j in range(0, len(clip_words), chunk_size):
            chunk = clip_words[j:j + chunk_size]
            start_ts = max(0, chunk[0]["start"] - clip["start"])
            end_ts = max(start_ts, min(clip_len, chunk[-1]["end"] - clip["start"]))
            text = " ".join(w["word"] for w in chunk)
            f.write(f"{j // chunk_size + 1}\n")
            f.write(f"{_to_srt_time(start_ts)} --> {_to_srt_time(end_ts)}\n")
            f.write(f"{text}\n\n")


def export_highlight_clips(
    source_video: str,
    highlights: list,
    transcript_words: list,
    output_dir: str = "./highlights",
    burn_captions: bool = True,
    caption_style: str = "fontsize=36:fontcolor=white:borderw=2:bordercolor=black"
) -> list:
    """
    Export highlight clips using ffmpeg, optionally burning in word-level captions.

    Args:
        source_video: Path to the raw footage.
        highlights: Windows from find_highlight_windows() ("start"/"end" keys).
        transcript_words: Word dicts with "start"/"end"/"word" keys.
        output_dir: Created if missing; clips and .srt files are written here.
        burn_captions: When True, an SRT is generated per clip and burned in.
        caption_style: ffmpeg force_style string for the subtitles filter.

    Returns:
        List of exported clip paths, in highlight order.
    """
    os.makedirs(output_dir, exist_ok=True)
    exported = []
    for i, clip in enumerate(highlights):
        output_path = f"{output_dir}/highlight_{i+1:02d}_{clip['start']:.0f}s.mp4"
        if burn_captions:
            # Keep words inside the clip, with a 0.5 s grace at the tail so
            # a word the clip cuts into still gets captioned.
            clip_words = [w for w in transcript_words
                          if w["start"] >= clip["start"] and w["end"] <= clip["end"] + 0.5]
            srt_path = f"{output_dir}/clip_{i+1}.srt"
            _write_clip_srt(srt_path, clip, clip_words)
            # NOTE(review): srt_path is interpolated raw into the filter
            # graph; paths containing ':' or quotes need ffmpeg escaping.
            (
                ffmpeg
                .input(source_video, ss=clip["start"], to=clip["end"])
                .output(
                    output_path,
                    vf=f"subtitles={srt_path}:force_style='{caption_style}'",
                    acodec="aac",
                    vcodec="libx264",
                    crf=18,
                    preset="fast"
                )
                .overwrite_output()
                .run(quiet=True)
            )
        else:
            (
                ffmpeg
                .input(source_video, ss=clip["start"], to=clip["end"])
                .output(output_path, acodec="aac", vcodec="libx264", crf=18)
                .overwrite_output()
                .run(quiet=True)
            )
        print(f"✓ Exported: {output_path}")
        exported.append(output_path)
    return exported
# Export the selected highlights with burned-in captions.
clips = export_highlight_clips(
source_video="raw_footage.mp4",
highlights=highlights,
transcript_words=transcript["words"],
output_dir="./output_clips",
burn_captions=True
)
# Some highlights may be dropped at export time, hence both counts.
print(f"\nExported {len(clips)} clips from {len(highlights)} highlights")
Full Pipeline in One Call
def run_auto_edit_pipeline(
    video_path: str,
    output_dir: str = "./auto_edited",
    yolo_model: str = "yolov8m.pt",
    whisper_model: str = "small",
    target_classes: list = None,
    n_highlights: int = 8
):
    """
    End-to-end auto-edit: score frames, transcribe, pick highlights, export.

    Returns a dict with the highlight windows, the exported clip paths,
    and the full transcript.
    """
    print("Step 1/4: Scoring frames with YOLO...")
    frame_scores = score_video_frames(video_path, yolo_model, target_classes)

    print("Step 2/4: Transcribing with Whisper...")
    speech = transcribe_with_timestamps(video_path, whisper_model)

    print("Step 3/4: Finding highlights...")
    windows = find_highlight_windows(frame_scores, speech["words"], top_n=n_highlights)
    print(f" Found {len(windows)} highlight windows")

    print("Step 4/4: Exporting clips...")
    clip_paths = export_highlight_clips(video_path, windows, speech["words"], output_dir)

    return {"highlights": windows, "clips": clip_paths, "transcript": speech}
# Run it
# Uses the default YOLO ("yolov8m.pt") and Whisper ("small") models.
result = run_auto_edit_pipeline(
video_path="30min_raw_footage.mp4",
output_dir="./tiktok_clips",
target_classes=["person", "bicycle"],
n_highlights=6
)
The Video Workspace productizes this entire pipeline with 97 lines of configuration and 30+ editing methods: viral moment detection, multi-platform formatting, thumbnail generation, aspect ratio conversion, and batch processing. 30 minutes of footage → 3 platform-ready clips in 3 minutes.