-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDownload.py
More file actions
104 lines (84 loc) · 3.12 KB
/
Copy pathDownload.py
File metadata and controls
104 lines (84 loc) · 3.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from datasets import load_dataset
# Load the "train" split of the MusicCaps dataset into the module-level name `dataset`.
dataset = load_dataset("google/MusicCaps", split="train")
# Print the first sample of the dataset so its fields can be inspected.
print(dataset[0])
import subprocess
import os
from pathlib import Path
import re
def sanitize_filename(filename):
    """Return *filename* with illegal file-name characters replaced.

    Every occurrence of ``<``, ``>``, ``:``, ``"``, ``/``, a backslash,
    ``|``, ``?`` or ``*`` is replaced by a single underscore; all other
    characters are returned unchanged.
    """
    illegal_chars = re.compile(r'[<>:"/\\|?*]')
    cleaned = illegal_chars.sub('_', filename)
    return cleaned
def download_clip(video_id, output_path, start_time, end_time, tmp_dir="./tmp", num_attempts=3):
    """Download one audio section of a YouTube video as a WAV file via ``yt-dlp``.

    Args:
        video_id: YouTube video id; inserted into the ``watch?v=`` URL.
        output_path: Destination file path. Its final component is passed
            through ``sanitize_filename`` and its parent directory is
            created if missing.
        start_time: Start of the section, forwarded to ``--download-sections``.
        end_time: End of the section; must be strictly greater than
            ``start_time``.
        tmp_dir: Directory that is created if missing. It is not otherwise
            used by this function.
        num_attempts: Maximum number of times ``yt-dlp`` is invoked.

    Returns:
        True as soon as one invocation exits successfully and the output
        file exists; False once every attempt has been used up.

    Raises:
        ValueError: If ``start_time`` is not strictly less than ``end_time``.
    """
    # Validate first so that a bad call does not leave directories behind.
    if start_time >= end_time:
        raise ValueError(f"开始时间 ({start_time}) 必须小于结束时间 ({end_time})")

    # Kept for backward compatibility: callers may rely on tmp_dir existing.
    Path(tmp_dir).mkdir(parents=True, exist_ok=True)

    requested_path = Path(output_path)
    # Only the file name is sanitized; the directory part is used as given.
    output_path = requested_path.parent / sanitize_filename(requested_path.name)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    command = [
        "yt-dlp",
        # Relative path: resolved against the current working directory.
        "--cookies", "cookies.txt",
        "--quiet",
        "--no-warnings",
        "--force-keyframes-at-cuts",
        "-x",
        "--audio-format", "wav",
        "-f", "bestaudio",
        "-o", str(output_path),
        "--download-sections", f"*{start_time}-{end_time}",
        f"https://www.youtube.com/watch?v={video_id}",
    ]

    # A bounded for-loop guarantees termination even when yt-dlp exits
    # successfully without producing the expected file.
    for attempt in range(1, num_attempts + 1):
        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError:
            print(f"下载失败,重试中 ({attempt}/{num_attempts})...")
            continue
        if output_path.exists():
            return True
    return False
from pathlib import Path
def download_musiccaps(data_dir, startAt=0, limit=None):
    """Download the audio clips referenced by the MusicCaps dataset.

    Loads the "train" split of ``google/MusicCaps`` and, for each selected
    row, calls ``download_clip`` to save ``<ytid>.wav`` into ``data_dir``.
    Rows whose output file already exists are skipped.

    Args:
        data_dir (str): Directory the clips are saved to; created if missing.
        startAt (int): Index of the first dataset row to process.
        limit (int | None): Maximum number of rows to process, counted from
            ``startAt``. ``None`` (the default) processes every remaining row.
    """
    dataset = load_dataset("google/MusicCaps", split="train")

    # Clamp the end of the window so a large limit never indexes past the data.
    total = len(dataset)
    stop = total if limit is None else min(total, startAt + limit)
    dataset = dataset.select(range(startAt, stop))

    data_dir = Path(data_dir)
    data_dir.mkdir(parents=True, exist_ok=True)

    # `position` is the 1-based row number within the full dataset.
    for position, example in enumerate(dataset, start=startAt + 1):
        video_id = example["ytid"]
        output_path = data_dir / f"{video_id}.wav"

        if output_path.exists():
            print(f"[{position}] 已存在,跳过: {output_path}")
            continue

        print(f"[{position}] 下载: {video_id} -> {output_path}")
        success = download_clip(
            video_id=video_id,
            output_path=str(output_path),
            start_time=example["start_s"],
            end_time=example["end_s"],
        )
        if not success:
            print(f"下载失败: {video_id}")
# Script entry point: download every clip into ./musiccaps_data, starting at row 0.
# NOTE(review): this call is at module level with no `__main__` guard, so it also runs on import.
download_musiccaps(data_dir="./musiccaps_data",startAt=0)
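# Resampling (disabled): the commented-out pydub snippet below would rewrite a file in place as WAV at a target sample rate.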
# from pydub import AudioSegment
# def resample_audio(file_path, target_sample_rate=16000):
# audio = AudioSegment.from_file(file_path)
# audio = audio.set_frame_rate(target_sample_rate)
# audio.export(file_path, format="wav")