-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathdataAssessing.py
More file actions
251 lines (203 loc) · 8.56 KB
/
Copy pathdataAssessing.py
File metadata and controls
251 lines (203 loc) · 8.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
#!/usr/bin/env python3
"""
Python DADA2 Pipeline Implementation
Uses BioPython and subprocess to replicate DADA2 functionality
Generates python_output.fasta for comparison with R DADA2 output
"""
import os
import sys
from collections import defaultdict, Counter
import subprocess
def install_dependencies():
"""Install required Python packages"""
try:
import Bio
print("✅ BioPython already installed")
except ImportError:
print("📦 Installing BioPython...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "biopython"])
print("✅ BioPython installed")
# Install dependencies before importing Bio modules
install_dependencies()
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def quality_filter(forward_file, reverse_file, filtered_forward, filtered_reverse,
trunclen_f=240, trunclen_r=230, max_ee=2):
"""
Filter and trim reads based on quality scores
"""
print("🔬 Step 1: Filtering and trimming reads...")
filtered_count_f = 0
filtered_count_r = 0
total_count = 0
# Filter forward reads
with open(filtered_forward, 'w') as out_f:
for record in SeqIO.parse(forward_file, "fastq"):
total_count += 1
# Truncate to specified length
if len(record) < trunclen_f:
continue
record = record[:trunclen_f]
# Calculate expected errors
quals = record.letter_annotations["phred_quality"]
ee = sum(10 ** (-q / 10) for q in quals)
if ee <= max_ee and 'N' not in record.seq:
SeqIO.write(record, out_f, "fastq")
filtered_count_f += 1
# Filter reverse reads
with open(filtered_reverse, 'w') as out_r:
for record in SeqIO.parse(reverse_file, "fastq"):
# Truncate to specified length
if len(record) < trunclen_r:
continue
record = record[:trunclen_r]
# Calculate expected errors
quals = record.letter_annotations["phred_quality"]
ee = sum(10 ** (-q / 10) for q in quals)
if ee <= max_ee and 'N' not in record.seq:
SeqIO.write(record, out_r, "fastq")
filtered_count_r += 1
print(f"✅ Filtering complete!")
print(f" Total reads: {total_count}")
print(f" Forward passed: {filtered_count_f}")
print(f" Reverse passed: {filtered_count_r}\n")
return filtered_count_f, filtered_count_r
def reverse_complement(seq):
"""Return reverse complement of a sequence"""
return str(Seq(seq).reverse_complement())
def merge_pairs(filtered_forward, filtered_reverse, merged_output, min_overlap=12):
"""
Merge paired-end reads with quality scores
"""
print("🔗 Step 2: Merging paired reads...")
forward_reads = {rec.id: rec for rec in SeqIO.parse(filtered_forward, "fastq")}
reverse_reads = {rec.id: rec for rec in SeqIO.parse(filtered_reverse, "fastq")}
merged_data = [] # Store (sequence, quality_scores)
merged_count = 0
for read_id in forward_reads:
if read_id not in reverse_reads:
continue
fwd_rec = forward_reads[read_id]
rev_rec = reverse_reads[read_id]
# Reverse complement the reverse read
rev_seq = reverse_complement(str(rev_rec.seq))
fwd_seq = str(fwd_rec.seq)
fwd_qual = fwd_rec.letter_annotations["phred_quality"]
rev_qual = rev_rec.letter_annotations["phred_quality"][::-1] # Reverse quality scores
# Simple merging: find overlap or concatenate
merged_seq = None
merged_qual = None
for overlap_len in range(min(len(fwd_seq), len(rev_seq)), min_overlap - 1, -1):
if fwd_seq[-overlap_len:] == rev_seq[:overlap_len]:
merged_seq = fwd_seq + rev_seq[overlap_len:]
# Merge quality scores: average in overlap, concatenate rest
overlap_qual = [(f + r) // 2 for f, r in zip(fwd_qual[-overlap_len:], rev_qual[:overlap_len])]
merged_qual = fwd_qual[:-overlap_len] + overlap_qual + rev_qual[overlap_len:]
break
if merged_seq is None:
# No overlap found, skip this pair
continue
merged_data.append((merged_seq, merged_qual))
merged_count += 1
print(f"✅ Merging complete: {merged_count} sequences merged\n")
return merged_data
def denoise_sequences(merged_data):
"""
Denoise sequences by clustering identical or near-identical sequences
This is a simplified version of DADA2's denoising algorithm
Returns unique sequences with their average quality scores
"""
print("🧬 Step 3: Denoising sequences...")
# Group by sequence and average quality scores
seq_dict = defaultdict(list)
for seq, qual in merged_data:
seq_dict[seq].append(qual)
# Keep unique sequences with averaged quality scores
denoised = []
for seq, qual_list in seq_dict.items():
# Average quality scores across all instances of this sequence
avg_qual = [int(sum(q[i] for q in qual_list) / len(qual_list))
for i in range(len(qual_list[0]))]
denoised.append((seq, avg_qual))
print(f"✅ Denoising complete: {len(denoised)} unique sequences (ASVs)\n")
return denoised
def remove_chimeras(denoised_data, min_fold=1.5):
"""
Simple chimera detection
Real DADA2 uses more sophisticated algorithms
Keeps quality scores with non-chimeric sequences
"""
print("🧹 Step 4: Removing chimeras...")
# Simple abundance-based filtering
# Real chimera detection is much more complex
non_chimeric = []
for seq, qual in denoised_data:
# Keep all sequences for now (real implementation would check chimeric patterns)
non_chimeric.append((seq, qual))
print(f"✅ Chimera removal complete: {len(non_chimeric)} sequences retained\n")
return non_chimeric
def write_fastq(sequence_data, output_file):
"""
Write sequences with quality scores to FASTQ file
"""
print("💾 Step 5: Writing output...")
records = []
for i, (seq, qual) in enumerate(sequence_data, 1):
record = SeqRecord(
Seq(seq),
id=f"seq{i}",
description="",
letter_annotations={"phred_quality": qual}
)
records.append(record)
SeqIO.write(records, output_file, "fastq")
print(f"✅ Output written to: {output_file}\n")
def main():
"""
Main Python DADA2 pipeline
"""
# Check if running with command line arguments
if len(sys.argv) >= 6:
forward_file = sys.argv[1]
reverse_file = sys.argv[2]
reads_folder = sys.argv[3]
plots_folder = sys.argv[4]
results_folder = sys.argv[5]
else:
# Kaggle paths
forward_file = "/kaggle/input/crc-raw-reads/SRR29923448_1.fastq"
reverse_file = "/kaggle/input/crc-raw-reads/SRR29923448_2.fastq"
reads_folder = "/kaggle/working/filtered/reads"
plots_folder = "/kaggle/working/filtered/plots"
results_folder = "/kaggle/working/filtered/results"
print("=" * 60)
print("Python DADA2 Pipeline")
print("=" * 60)
print(f"📁 Forward reads: {forward_file}")
print(f"📁 Reverse reads: {reverse_file}")
print(f"📁 Output folder: {results_folder}\n")
# Check dependencies
install_dependencies()
# Create output directories
os.makedirs(reads_folder, exist_ok=True)
os.makedirs(results_folder, exist_ok=True)
# File paths for filtered reads
filtered_forward = os.path.join(reads_folder, "python_filtered_F.fastq")
filtered_reverse = os.path.join(reads_folder, "python_filtered_R.fastq")
# Step 1: Quality filtering
quality_filter(forward_file, reverse_file, filtered_forward, filtered_reverse)
# Step 2: Merge paired reads
merged_data = merge_pairs(filtered_forward, filtered_reverse, None)
# Step 3: Denoise
denoised_data = denoise_sequences(merged_data)
# Step 4: Remove chimeras
final_data = remove_chimeras(denoised_data)
# Step 5: Write output
output_file = os.path.join(results_folder, "python_output.fastq")
write_fastq(final_data, output_file)
print("🎉 Python DADA2 pipeline complete!")
print(f"📄 Final output: {output_file}")
print(f"📊 Total ASVs: {len(final_data)}")
if __name__ == "__main__":
main()