-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathspeaker_id_onboarding.py
More file actions
79 lines (65 loc) · 3.36 KB
/
Copy pathspeaker_id_onboarding.py
File metadata and controls
79 lines (65 loc) · 3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python3
"""
Speaker verification onboarding example.
This mirrors the React Native onboarding flow at a high level:
- collect multiple microphone samples
- create speaker embeddings
- save enrollment JSON for later verification / wake-word gating
"""
from __future__ import annotations
import argparse
import keyword_detection
import os
from pathlib import Path
from keyword_detection import SpeakerVerification
# Defaults for the command-line options declared in build_parser().

# Default for --sample-rate, the microphone sample rate (presumably in Hz — confirm).
DEFAULT_SAMPLE_RATE: int = 16_000
# Default for --frame-size, the microphone frame size (presumably in samples — confirm).
DEFAULT_FRAME_SIZE: int = 1_280
# Default for --sample-seconds, the seconds recorded per enrollment sample.
DEFAULT_SAMPLE_SECONDS: float = 2.0
# Default for --sample-count, the number of enrollment samples to collect.
DEFAULT_SAMPLE_COUNT: int = 5
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the enrollment onboarding script.

    The default ``--output`` location is ``sv_enrollment.json`` in the same
    directory as this script; the numeric options default to the module-level
    ``DEFAULT_*`` constants.

    Returns:
        An ``argparse.ArgumentParser`` with the options ``--output``,
        ``--enrollment-id``, ``--sample-count``, ``--sample-seconds``,
        ``--sample-rate`` and ``--frame-size`` registered in that order.
    """
    default_output = Path(__file__).resolve().parent / "sv_enrollment.json"

    # One (flag, add_argument keyword arguments) pair per option, in help order.
    option_specs = (
        (
            "--output",
            {
                "default": str(default_output),
                "help": "Where to write the enrollment JSON.",
            },
        ),
        (
            "--enrollment-id",
            {
                "default": "davoice",
                "help": "Enrollment id stored in the JSON.",
            },
        ),
        (
            "--sample-count",
            {
                "type": int,
                "default": DEFAULT_SAMPLE_COUNT,
                "help": "Number of enrollment samples to collect.",
            },
        ),
        (
            "--sample-seconds",
            {
                "type": float,
                "default": DEFAULT_SAMPLE_SECONDS,
                "help": "Seconds to record per enrollment sample. iOS/RN currently use 2.0; try 0.5 to experiment.",
            },
        ),
        (
            "--sample-rate",
            {
                "type": int,
                "default": DEFAULT_SAMPLE_RATE,
                "help": "Microphone sample rate.",
            },
        ),
        (
            "--frame-size",
            {
                "type": int,
                "default": DEFAULT_FRAME_SIZE,
                "help": "Microphone frame size.",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description="Create a speaker-verification enrollment JSON from microphone audio.",
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli
def main() -> None:
    """Interactively record enrollment samples and save them as enrollment JSON.

    Flow:
      1. Parse the command line and reject unusable sample settings.
      2. Resolve the output path and make sure its directory exists.
      3. Read the license key from ``licensekey.txt`` next to this script and
         hand it to ``SpeakerVerification``.
      4. For each requested sample, wait for Enter, record from the
         microphone and collect the resulting embedding.
      5. Build the enrollment JSON from the embeddings and write it out.

    Raises:
        SystemExit: via ``parser.error`` when ``--sample-count`` is below 1 or
            ``--sample-seconds`` is not positive.
        FileNotFoundError: when ``licensekey.txt`` is missing.
        OSError: when the output directory cannot be created or the file
            cannot be written.
    """
    parser = build_parser()
    args = parser.parse_args()

    # Fail before touching the microphone: a count below 1 would skip the
    # recording loop and build an enrollment from an empty embedding list.
    if args.sample_count < 1:
        parser.error("--sample-count must be at least 1")
    if args.sample_seconds <= 0:
        parser.error("--sample-seconds must be greater than 0")

    # Resolve the destination and create its directory up front so that a bad
    # path is reported now, not after the user has recorded every sample.
    output_path = Path(args.output).expanduser().resolve()
    output_path.parent.mkdir(parents=True, exist_ok=True)

    script_dir = Path(__file__).resolve().parent
    license_path = script_dir / "licensekey.txt"
    license_key = license_path.read_text(encoding="utf-8").strip()

    sv = SpeakerVerification()
    # The key is a secret: report where it came from, never its value.
    print(f"Loaded license key from {license_path}")
    sv.set_speaker_verification_license(license_key)
    controller = sv.SpeakerVerificationMicController(
        sample_rate=args.sample_rate,
        frame_size=args.frame_size,
    )

    print("Speaker verification onboarding")
    print(f"Sample count : {args.sample_count}")
    print(f"Sample duration : {args.sample_seconds:.2f}s")
    print(f"Sample rate : {args.sample_rate}")
    print(f"Frame size : {args.frame_size}")
    print(f"Output JSON : {args.output}")
    print(f"keyword_detection package path: {keyword_detection.__file__}")
    print(f"DAVOICE_SV_DEBUG={os.environ.get('DAVOICE_SV_DEBUG', '')}")
    print("")
    print("Press Enter before each enrollment sample and speak clearly.")

    embeddings = []
    for index in range(1, args.sample_count + 1):
        # Block until the user confirms, so each recording starts on demand.
        input(f"\n[{index}/{args.sample_count}] Press Enter to record sample {index}...")
        print(f"Recording sample {index}/{args.sample_count} for {args.sample_seconds:.2f}s...")
        embeddings.append(controller.create_enrollment_embedding_from_mic(sample_seconds=args.sample_seconds))
        print(f"Collected sample {index}/{args.sample_count}.")

    enrollment_json = controller.create_enrollment_json(args.enrollment_id, embeddings)
    output_path.write_text(enrollment_json, encoding="utf-8")

    print("")
    print(f"Saved enrollment JSON to: {output_path}")
    print("You can reuse it with speaker_id_verification.py or wakeword_with_speaker_id.py.")
# Run the onboarding flow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()