From 78852c1543e40a30d09c65e21750b8e55e91ef7a Mon Sep 17 00:00:00 2001 From: Twopic2 Date: Wed, 25 Feb 2026 00:32:17 -0800 Subject: [PATCH] made decode path --- transcode/decode.go | 112 +++++++++++++++++++++++++++++++++++++++++ transcode/transcode.go | 103 ------------------------------------- 2 files changed, 112 insertions(+), 103 deletions(-) create mode 100644 transcode/decode.go diff --git a/transcode/decode.go b/transcode/decode.go new file mode 100644 index 0000000..3c58f8a --- /dev/null +++ b/transcode/decode.go @@ -0,0 +1,112 @@ +package transcode + +import ( + "bufio" + "fmt" + "os/exec" + "regexp" + "strconv" +) + +// ────────────────────────────────────────────────────────────────────────────── +// Silence detection +// ────────────────────────────────────────────────────────────────────────────── + +// DetectSpeechSegments runs ffmpeg's silencedetect filter and returns the +// non-silent (speech) segments. A "segments" WebSocket message is broadcast +// when detection finishes. +func DetectSpeechSegments(inputPath string, noiseDb, minDuration, padding float64, broadcast func([]byte)) ([]Segment, error) { + filter := fmt.Sprintf("silencedetect=noise=%.0fdB:d=%.2f", noiseDb, minDuration) + + cmd := exec.Command("ffmpeg", + "-i", inputPath, + "-af", filter, + "-f", "null", "-", + ) + + // ffmpeg writes silencedetect output to stderr + stderr, err := cmd.StderrPipe() + if err != nil { + return nil, err + } + if err := cmd.Start(); err != nil { + return nil, fmt.Errorf("ffmpeg start: %w", err) + } + + var ( + silences []silenceInterval + pendingStart float64 + totalDuration float64 + + durationRe = regexp.MustCompile(`Duration:\s+(\d+):(\d+):([0-9.]+)`) + startRe = regexp.MustCompile(`silence_start:\s*([0-9.e+\-]+)`) + endRe = regexp.MustCompile(`silence_end:\s*([0-9.e+\-]+)`) + ) + + scanner := bufio.NewScanner(stderr) + for scanner.Scan() { + line := scanner.Text() + + if m := durationRe.FindStringSubmatch(line); m != nil { + h, _ := strconv.ParseFloat(m[1], 64) + min, _ := strconv.ParseFloat(m[2], 64) + s, _ := strconv.ParseFloat(m[3], 64) + totalDuration = h*3600 + min*60 + s + } + if m := startRe.FindStringSubmatch(line); m != nil { + pendingStart, _ = strconv.ParseFloat(m[1], 64) + } + if m := endRe.FindStringSubmatch(line); m != nil { + end, _ := strconv.ParseFloat(m[1], 64) + silences = append(silences, silenceInterval{start: pendingStart, end: end}) + } + } + + if err := cmd.Wait(); err != nil { + return nil, fmt.Errorf("ffmpeg: %w", err) + } + + segments := removeSilence(silences, totalDuration, padding) + + send(&wsMsg{ + Type: "segments", + Segments: segments, + Duration: totalDuration, + }, broadcast) + + return segments, nil +} + +type silenceInterval struct{ start, end float64 } + +// removeSilence turns silence regions into speech regions, with a small +// padding buffer so words at the edges don't get clipped. +func removeSilence(silences []silenceInterval, totalDuration, padding float64) []Segment { + if len(silences) == 0 { + return []Segment{{Start: 0, End: totalDuration}} + } + + var segments []Segment + cursor := 0.0 + + for _, s := range silences { + segEnd := s.start + padding + if segEnd > totalDuration { + segEnd = totalDuration + } + if segEnd-cursor > 0.05 { + segments = append(segments, Segment{Start: cursor, End: segEnd}) + } + next := s.end - padding + if next < segEnd { + next = segEnd + } + cursor = next + } + + if cursor < totalDuration-0.01 { + segments = append(segments, Segment{Start: cursor, End: totalDuration}) + } + + return segments +} diff --git a/transcode/transcode.go b/transcode/transcode.go index 4e8b10e..2cd9ef3 100644 --- a/transcode/transcode.go +++ b/transcode/transcode.go @@ -30,109 +30,6 @@ func send(msg *wsMsg, broadcast func([]byte)) { broadcast(data) } -// ────────────────────────────────────────────────────────────────────────────── -// Silence detection -// ────────────────────────────────────────────────────────────────────────────── - -// DetectSpeechSegments runs ffmpeg's silencedetect filter and returns the -// non-silent (speech) segments. A "segments" WebSocket message is broadcast -// when detection finishes. -func DetectSpeechSegments(inputPath string, noiseDb, minDuration, padding float64, broadcast func([]byte)) ([]Segment, error) { - filter := fmt.Sprintf("silencedetect=noise=%.0fdB:d=%.2f", noiseDb, minDuration) - - cmd := exec.Command("ffmpeg", - "-i", inputPath, - "-af", filter, - "-f", "null", "-", - ) - - // ffmpeg writes silencedetect output to stderr - stderr, err := cmd.StderrPipe() - if err != nil { - return nil, err - } - if err := cmd.Start(); err != nil { - return nil, fmt.Errorf("ffmpeg start: %w", err) - } - - var ( - silences []silenceInterval - pendingStart float64 - totalDuration float64 - - durationRe = regexp.MustCompile(`Duration:\s+(\d+):(\d+):([0-9.]+)`) - startRe = regexp.MustCompile(`silence_start:\s*([0-9.e+\-]+)`) - endRe = regexp.MustCompile(`silence_end:\s*([0-9.e+\-]+)`) - ) - - scanner := bufio.NewScanner(stderr) - for scanner.Scan() { - line := scanner.Text() - - if m := durationRe.FindStringSubmatch(line); m != nil { - h, _ := strconv.ParseFloat(m[1], 64) - min, _ := strconv.ParseFloat(m[2], 64) - s, _ := strconv.ParseFloat(m[3], 64) - totalDuration = h*3600 + min*60 + s - } - if m := startRe.FindStringSubmatch(line); m != nil { - pendingStart, _ = strconv.ParseFloat(m[1], 64) - } - if m := endRe.FindStringSubmatch(line); m != nil { - end, _ := strconv.ParseFloat(m[1], 64) - silences = append(silences, silenceInterval{start: pendingStart, end: end}) - } - } - - if err := cmd.Wait(); err != nil { - return nil, fmt.Errorf("ffmpeg: %w", err) - } - - segments := removeSilence(silences, totalDuration, padding) - - send(&wsMsg{ - Type: "segments", - Segments: segments, - Duration: totalDuration, - }, broadcast) - - return segments, nil -} - -type silenceInterval struct{ start, end float64 } - -// removeSilence turns silence regions into speech regions, with a small -// padding buffer so words at the edges don't get clipped. -func removeSilence(silences []silenceInterval, totalDuration, padding float64) []Segment { - if len(silences) == 0 { - return []Segment{{Start: 0, End: totalDuration}} - } - - var segments []Segment - cursor := 0.0 - - for _, s := range silences { - segEnd := s.start + padding - if segEnd > totalDuration { - segEnd = totalDuration - } - if segEnd-cursor > 0.05 { - segments = append(segments, Segment{Start: cursor, End: segEnd}) - } - next := s.end - padding - if next < segEnd { - next = segEnd - } - cursor = next - } - - if cursor < totalDuration-0.01 { - segments = append(segments, Segment{Start: cursor, End: totalDuration}) - } - - return segments -} - func ExportSegments( inputPath, outputPath string, segments []Segment,