made decode path

This commit is contained in:
2026-02-25 00:32:17 -08:00
parent 68ccc2f4fc
commit 78852c1543
2 changed files with 112 additions and 103 deletions

112
transcode/decode.go Normal file
View File

@@ -0,0 +1,112 @@
package transcode
import (
"bufio"
"fmt"
"os/exec"
"regexp"
"strconv"
)
// ──────────────────────────────────────────────────────────────────────────────
// Silence detection
// ──────────────────────────────────────────────────────────────────────────────
// Patterns for the ffmpeg stderr lines that silence detection relies on.
// They are compiled once at package scope rather than on every call.
var (
	ffmpegDurationRe = regexp.MustCompile(`Duration:\s+(\d+):(\d+):([0-9.]+)`)
	silenceStartRe   = regexp.MustCompile(`silence_start:\s*([0-9.e+\-]+)`)
	silenceEndRe     = regexp.MustCompile(`silence_end:\s*([0-9.e+\-]+)`)
)

// DetectSpeechSegments runs ffmpeg's silencedetect filter over inputPath and
// returns the non-silent (speech) segments. A "segments" WebSocket message
// carrying the segments and the total duration is broadcast when detection
// finishes successfully.
//
// noiseDb and minDuration are formatted into the filter's noise= (dB) and d=
// options; padding is forwarded to removeSilence.
//
// An error is returned when ffmpeg cannot be started, its log cannot be read,
// it exits unsuccessfully, or no input duration could be parsed from its log.
func DetectSpeechSegments(inputPath string, noiseDb, minDuration, padding float64, broadcast func([]byte)) ([]Segment, error) {
	filter := fmt.Sprintf("silencedetect=noise=%.0fdB:d=%.2f", noiseDb, minDuration)
	cmd := exec.Command("ffmpeg",
		// Progress stats are terminated by "\r", not "\n"; on a long run they
		// can pile up into a single "line" that overflows the scanner buffer.
		"-nostats",
		"-i", inputPath,
		"-af", filter,
		"-f", "null", "-",
	)

	// ffmpeg writes silencedetect output to stderr
	stderr, err := cmd.StderrPipe()
	if err != nil {
		return nil, fmt.Errorf("ffmpeg stderr pipe: %w", err)
	}
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("ffmpeg start: %w", err)
	}

	scanner := bufio.NewScanner(stderr)
	silences, totalDuration := parseSilenceLog(scanner)
	if err := scanner.Err(); err != nil {
		// Nobody is draining stderr any more, so ffmpeg could block on a full
		// pipe and Wait would never return. Stop it first; the scan error is
		// the one worth reporting, so the cleanup results are ignored.
		_ = cmd.Process.Kill()
		_ = cmd.Wait()
		return nil, fmt.Errorf("ffmpeg: reading stderr: %w", err)
	}
	if err := cmd.Wait(); err != nil {
		return nil, fmt.Errorf("ffmpeg: %w", err)
	}
	if totalDuration <= 0 {
		// Without a duration every segment would be clamped to zero length.
		return nil, fmt.Errorf("ffmpeg: could not determine duration of %q", inputPath)
	}

	segments := removeSilence(silences, totalDuration, padding)
	send(&wsMsg{
		Type:     "segments",
		Segments: segments,
		Duration: totalDuration,
	}, broadcast)
	return segments, nil
}

// parseSilenceLog consumes ffmpeg log lines from scanner and returns the
// silence intervals reported by silencedetect together with the input
// duration in seconds (0 if no Duration line was found). The caller is
// responsible for checking scanner.Err afterwards.
func parseSilenceLog(scanner *bufio.Scanner) ([]silenceInterval, float64) {
	var (
		silences      []silenceInterval
		totalDuration float64
		pendingStart  float64
		inSilence     bool
	)
	for scanner.Scan() {
		line := scanner.Text()
		if m := ffmpegDurationRe.FindStringSubmatch(line); m != nil {
			hours, errH := strconv.ParseFloat(m[1], 64)
			minutes, errM := strconv.ParseFloat(m[2], 64)
			seconds, errS := strconv.ParseFloat(m[3], 64)
			if errH == nil && errM == nil && errS == nil {
				totalDuration = hours*3600 + minutes*60 + seconds
			}
		}
		if m := silenceStartRe.FindStringSubmatch(line); m != nil {
			// A value that does not parse is skipped rather than read as 0.
			if start, err := strconv.ParseFloat(m[1], 64); err == nil {
				pendingStart, inSilence = start, true
			}
		}
		if m := silenceEndRe.FindStringSubmatch(line); m != nil {
			if end, err := strconv.ParseFloat(m[1], 64); err == nil {
				silences = append(silences, silenceInterval{start: pendingStart, end: end})
				inSilence = false
			}
		}
	}
	// A silence_start with no matching silence_end means the log ended while
	// still inside a silence; close that interval at the end of the input.
	if inSilence && totalDuration > pendingStart {
		silences = append(silences, silenceInterval{start: pendingStart, end: totalDuration})
	}
	return silences, totalDuration
}
// silenceInterval is one silent stretch parsed from ffmpeg's silencedetect
// log. Both bounds are offsets in seconds on the same timeline as the total
// duration they are compared against.
type silenceInterval struct {
	start float64 // offset at which the silence begins
	end   float64 // offset at which the silence ends
}
// removeSilence turns silence regions into speech regions, with a small
// padding buffer so words at the edges don't get clipped.
//
// silences is expected in chronological order (assumed to match the order in
// which ffmpeg reports them). totalDuration bounds every segment, and padding
// is the number of seconds by which each speech region is widened into the
// neighbouring silence. Padding is only applied around actual speech: a
// silence that opens the input or runs to its end does not produce a
// padding-sized segment of pure silence.
//
// With no silences the whole input is returned as a single segment. The
// result is nil when no speech segment qualifies.
func removeSilence(silences []silenceInterval, totalDuration, padding float64) []Segment {
	if len(silences) == 0 {
		return []Segment{{Start: 0, End: totalDuration}}
	}

	const (
		minSegment = 0.05 // padded segments no longer than this are dropped
		tailSlack  = 0.01 // tolerance when deciding whether anything follows the last silence
	)

	var segments []Segment
	cursor := 0.0      // padded start of the next segment to emit
	speechStart := 0.0 // unpadded point where the current run of speech begins
	for _, s := range silences {
		// Only emit when there is real speech between the previous silence
		// (or the start of the input) and this one.
		if s.start > speechStart {
			segEnd := s.start + padding
			if segEnd > totalDuration {
				segEnd = totalDuration
			}
			if segEnd-cursor > minSegment {
				segments = append(segments, Segment{Start: cursor, End: segEnd})
			}
			if segEnd > cursor {
				cursor = segEnd
			}
		}
		// Start the next segment slightly inside this silence, but never
		// before the end of what has already been emitted.
		if next := s.end - padding; next > cursor {
			cursor = next
		}
		if s.end > speechStart {
			speechStart = s.end
		}
	}

	// Trailing speech after the last silence, if any.
	if speechStart < totalDuration-tailSlack && cursor < totalDuration-tailSlack {
		segments = append(segments, Segment{Start: cursor, End: totalDuration})
	}
	return segments
}

View File

@@ -30,109 +30,6 @@ func send(msg *wsMsg, broadcast func([]byte)) {
broadcast(data) broadcast(data)
} }
// ──────────────────────────────────────────────────────────────────────────────
// Silence detection
// ──────────────────────────────────────────────────────────────────────────────
// Patterns for the ffmpeg stderr lines that silence detection relies on.
// They are compiled once at package scope rather than on every call.
var (
	ffmpegDurationRe = regexp.MustCompile(`Duration:\s+(\d+):(\d+):([0-9.]+)`)
	silenceStartRe   = regexp.MustCompile(`silence_start:\s*([0-9.e+\-]+)`)
	silenceEndRe     = regexp.MustCompile(`silence_end:\s*([0-9.e+\-]+)`)
)

// DetectSpeechSegments runs ffmpeg's silencedetect filter over inputPath and
// returns the non-silent (speech) segments. A "segments" WebSocket message
// carrying the segments and the total duration is broadcast when detection
// finishes successfully.
//
// noiseDb and minDuration are formatted into the filter's noise= (dB) and d=
// options; padding is forwarded to removeSilence.
//
// An error is returned when ffmpeg cannot be started, its log cannot be read,
// it exits unsuccessfully, or no input duration could be parsed from its log.
func DetectSpeechSegments(inputPath string, noiseDb, minDuration, padding float64, broadcast func([]byte)) ([]Segment, error) {
	filter := fmt.Sprintf("silencedetect=noise=%.0fdB:d=%.2f", noiseDb, minDuration)
	cmd := exec.Command("ffmpeg",
		// Progress stats are terminated by "\r", not "\n"; on a long run they
		// can pile up into a single "line" that overflows the scanner buffer.
		"-nostats",
		"-i", inputPath,
		"-af", filter,
		"-f", "null", "-",
	)

	// ffmpeg writes silencedetect output to stderr
	stderr, err := cmd.StderrPipe()
	if err != nil {
		return nil, fmt.Errorf("ffmpeg stderr pipe: %w", err)
	}
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("ffmpeg start: %w", err)
	}

	scanner := bufio.NewScanner(stderr)
	silences, totalDuration := parseSilenceLog(scanner)
	if err := scanner.Err(); err != nil {
		// Nobody is draining stderr any more, so ffmpeg could block on a full
		// pipe and Wait would never return. Stop it first; the scan error is
		// the one worth reporting, so the cleanup results are ignored.
		_ = cmd.Process.Kill()
		_ = cmd.Wait()
		return nil, fmt.Errorf("ffmpeg: reading stderr: %w", err)
	}
	if err := cmd.Wait(); err != nil {
		return nil, fmt.Errorf("ffmpeg: %w", err)
	}
	if totalDuration <= 0 {
		// Without a duration every segment would be clamped to zero length.
		return nil, fmt.Errorf("ffmpeg: could not determine duration of %q", inputPath)
	}

	segments := removeSilence(silences, totalDuration, padding)
	send(&wsMsg{
		Type:     "segments",
		Segments: segments,
		Duration: totalDuration,
	}, broadcast)
	return segments, nil
}

// parseSilenceLog consumes ffmpeg log lines from scanner and returns the
// silence intervals reported by silencedetect together with the input
// duration in seconds (0 if no Duration line was found). The caller is
// responsible for checking scanner.Err afterwards.
func parseSilenceLog(scanner *bufio.Scanner) ([]silenceInterval, float64) {
	var (
		silences      []silenceInterval
		totalDuration float64
		pendingStart  float64
		inSilence     bool
	)
	for scanner.Scan() {
		line := scanner.Text()
		if m := ffmpegDurationRe.FindStringSubmatch(line); m != nil {
			hours, errH := strconv.ParseFloat(m[1], 64)
			minutes, errM := strconv.ParseFloat(m[2], 64)
			seconds, errS := strconv.ParseFloat(m[3], 64)
			if errH == nil && errM == nil && errS == nil {
				totalDuration = hours*3600 + minutes*60 + seconds
			}
		}
		if m := silenceStartRe.FindStringSubmatch(line); m != nil {
			// A value that does not parse is skipped rather than read as 0.
			if start, err := strconv.ParseFloat(m[1], 64); err == nil {
				pendingStart, inSilence = start, true
			}
		}
		if m := silenceEndRe.FindStringSubmatch(line); m != nil {
			if end, err := strconv.ParseFloat(m[1], 64); err == nil {
				silences = append(silences, silenceInterval{start: pendingStart, end: end})
				inSilence = false
			}
		}
	}
	// A silence_start with no matching silence_end means the log ended while
	// still inside a silence; close that interval at the end of the input.
	if inSilence && totalDuration > pendingStart {
		silences = append(silences, silenceInterval{start: pendingStart, end: totalDuration})
	}
	return silences, totalDuration
}
// silenceInterval is one silent stretch parsed from ffmpeg's silencedetect
// log. Both bounds are offsets in seconds on the same timeline as the total
// duration they are compared against.
type silenceInterval struct {
	start float64 // offset at which the silence begins
	end   float64 // offset at which the silence ends
}
// removeSilence turns silence regions into speech regions, with a small
// padding buffer so words at the edges don't get clipped.
//
// silences is expected in chronological order (assumed to match the order in
// which ffmpeg reports them). totalDuration bounds every segment, and padding
// is the number of seconds by which each speech region is widened into the
// neighbouring silence. Padding is only applied around actual speech: a
// silence that opens the input or runs to its end does not produce a
// padding-sized segment of pure silence.
//
// With no silences the whole input is returned as a single segment. The
// result is nil when no speech segment qualifies.
func removeSilence(silences []silenceInterval, totalDuration, padding float64) []Segment {
	if len(silences) == 0 {
		return []Segment{{Start: 0, End: totalDuration}}
	}

	const (
		minSegment = 0.05 // padded segments no longer than this are dropped
		tailSlack  = 0.01 // tolerance when deciding whether anything follows the last silence
	)

	var segments []Segment
	cursor := 0.0      // padded start of the next segment to emit
	speechStart := 0.0 // unpadded point where the current run of speech begins
	for _, s := range silences {
		// Only emit when there is real speech between the previous silence
		// (or the start of the input) and this one.
		if s.start > speechStart {
			segEnd := s.start + padding
			if segEnd > totalDuration {
				segEnd = totalDuration
			}
			if segEnd-cursor > minSegment {
				segments = append(segments, Segment{Start: cursor, End: segEnd})
			}
			if segEnd > cursor {
				cursor = segEnd
			}
		}
		// Start the next segment slightly inside this silence, but never
		// before the end of what has already been emitted.
		if next := s.end - padding; next > cursor {
			cursor = next
		}
		if s.end > speechStart {
			speechStart = s.end
		}
	}

	// Trailing speech after the last silence, if any.
	if speechStart < totalDuration-tailSlack && cursor < totalDuration-tailSlack {
		segments = append(segments, Segment{Start: cursor, End: totalDuration})
	}
	return segments
}
func ExportSegments( func ExportSegments(
inputPath, outputPath string, inputPath, outputPath string,
segments []Segment, segments []Segment,