-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilter_conllu_boilerplate.py
More file actions
187 lines (156 loc) · 6.09 KB
/
Copy pathfilter_conllu_boilerplate.py
File metadata and controls
187 lines (156 loc) · 6.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/usr/bin/env python3
"""
Filter large CoNLL-U files by removing sentences whose text starts with boilerplate.
Usage:
python filter_conllu_boilerplate.py input.conllu output.conllu [--patterns file.txt] [--verbose]
The script streams the file line by line, so it works with files too large
to fit in memory (tens of GBs).
Matching is plain prefix matching (no regex, no substring search) - if the
sentence text STARTS WITH any of the patterns, the entire sentence is removed.
"""
import argparse
import sys
from pathlib import Path
# Default boilerplate patterns to filter (sentence text prefixes).
# main() uses this list when no --patterns file is supplied. Each entry is
# tested against a record's "# text = " value with str.startswith, so the
# comparison is case-sensitive and anchored at the start of the sentence text.
DEFAULT_PATTERNS = [
    "frontiers-fpsyg-corpus.txt Journal Information",
    "Journal ID (publisher-id):",
    "Psychology Journal Abbreviation:",
    "Psychology ISSN:",
    "Publisher: Frontiers Research Foundation",
    "Article Information",
    "Copyright",
    "open-access:",
    "Received Day:",
    "Accepted Day:",
    "Electronic publication date:",
    "collection publication date:",
    "Volume:",
    "DOI:",
    "[doi:",
    # Common journal boilerplate
    "Front.",
    # Page separators (long underscore lines)
    # NOTE(review): under prefix matching, "____" already matches every longer
    # underscore run listed below, so those entries look redundant - confirm
    # before pruning (the verbose pattern count would change).
    "____",
    "______",
    "________________________________________________________________",
    "____________________________________________________________________________________",
]
def load_patterns(pattern_file: str) -> list[str]:
    """Load boilerplate prefixes from a text file, one pattern per line.

    Leading and trailing whitespace is stripped from every line. Blank lines
    and lines that start with ``#`` (after stripping) are skipped.

    Args:
        pattern_file: Path to a UTF-8 encoded pattern file, with or without
            a leading byte-order mark.

    Returns:
        The patterns in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" reads plain UTF-8 and additionally drops a leading BOM.
    # With plain "utf-8" the BOM would stay glued to the first line, so the
    # first pattern would never match and a first-line comment would be
    # mistaken for a pattern.
    with open(pattern_file, "r", encoding="utf-8-sig") as f:
        stripped_lines = (line.strip() for line in f)
        return [
            line for line in stripped_lines
            if line and not line.startswith("#")
        ]
def matches_boilerplate(text: str, patterns: list[str]) -> bool:
    """Tell whether *text* begins with at least one of the given prefixes.

    This is a plain, case-sensitive ``str.startswith`` comparison - no regex
    and no search inside the text. An empty *patterns* list never matches.
    """
    return any(text.startswith(prefix) for prefix in patterns)
def extract_text_from_record(record_lines: list[str]) -> str | None:
"""Extract the # text = value from a CoNLL-U record."""
for line in record_lines:
if line.startswith("# text = "):
return line[9:].rstrip("\r\n") # Remove '# text = ' and trailing whitespace
return None
def _iter_conllu_records(lines):
    """Yield each blank-line-delimited record as a list of its raw lines.

    *lines* is any iterable of text lines (e.g. an open file). Lines that are
    empty or whitespace-only act as separators and are not part of a record.
    A final record that is not followed by a separator is still yielded.
    """
    record: list[str] = []
    for line in lines:
        if line.strip():
            record.append(line)
        elif record:
            yield record
            record = []
    if record:
        yield record


def filter_conllu_stream(
    input_path: str,
    output_path: str,
    patterns: list[str],
    buffer_size: int = 8192
) -> tuple[int, int]:
    """
    Stream-filter a CoNLL-U file, removing records whose text starts with boilerplate.

    The input is read line by line and only one record is held in memory at
    a time. A record is dropped when its ``# text = `` value starts with one
    of *patterns*; records without a ``# text = `` line are always kept.
    Every kept record is written followed by one empty line.

    Args:
        input_path: Path to input CoNLL-U file
        output_path: Path to output filtered CoNLL-U file (opened in "w" mode,
            so an existing file is overwritten)
        patterns: List of string prefixes to filter
        buffer_size: Passed as ``buffering`` to both ``open`` calls

    Returns:
        Tuple of (kept_records, removed_records). Note that the first value
        counts the records written to the output, not all records read.
    """
    kept = 0
    removed = 0

    with open(input_path, "r", encoding="utf-8", buffering=buffer_size) as infile, \
            open(output_path, "w", encoding="utf-8", buffering=buffer_size) as outfile:
        for record in _iter_conllu_records(infile):
            text = extract_text_from_record(record)
            if text is not None and matches_boilerplate(text, patterns):
                removed += 1
                continue

            outfile.write("".join(record))
            # If the input ends without a newline, the last line of the final
            # record is unterminated; terminate it first so the "\n" below
            # really produces the blank separator line instead of merely
            # ending that last token line.
            if not record[-1].endswith("\n"):
                outfile.write("\n")
            outfile.write("\n")
            kept += 1

    return kept, removed
def main():
    """Command-line entry point: parse arguments, pick patterns, run the filter.

    Exits via ``sys.exit`` with an error message when the input file does not
    exist or when input and output resolve to the same path. With --verbose,
    progress and the final counts go to stderr; otherwise the two counts
    returned by ``filter_conllu_stream`` are printed to stdout, tab-separated.
    """
    cli = argparse.ArgumentParser(
        description="Filter large CoNLL-U files by removing boilerplate sentences."
    )
    cli.add_argument("input", help="Input CoNLL-U file path")
    cli.add_argument("output", help="Output CoNLL-U file path")
    cli.add_argument(
        "--patterns", "-p",
        help="File containing patterns (one per line), or use default patterns",
    )
    cli.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Show progress information",
    )
    opts = cli.parse_args()

    def report(message: str) -> None:
        # Progress messages are only shown in verbose mode and go to stderr
        # so stdout stays clean for the machine-readable result.
        if opts.verbose:
            print(message, file=sys.stderr)

    # Choose the pattern list: user-supplied file or the built-in defaults.
    if opts.patterns:
        patterns = load_patterns(opts.patterns)
        report(f"Loaded {len(patterns)} patterns from {opts.patterns}")
    else:
        patterns = DEFAULT_PATTERNS
        report(f"Using {len(patterns)} default patterns")

    # The input must exist before anything is written.
    source = Path(opts.input)
    if not source.exists():
        sys.exit(f"Error: Input file '{opts.input}' not found")

    # Opening the output truncates it, so refuse to filter a file onto itself.
    if source.resolve() == Path(opts.output).resolve():
        sys.exit("Error: Input and output files are the same - aborting to prevent data loss")

    report(f"Processing {opts.input} -> {opts.output}...")
    total, removed = filter_conllu_stream(opts.input, opts.output, patterns)

    if opts.verbose:
        print(f"Done. Kept {total} records, removed {removed} boilerplate records.", file=sys.stderr)
    else:
        # Minimal output for scripting
        print(f"{total}\t{removed}")


if __name__ == "__main__":
    main()