# Babel/config.yaml (NCATSTranslator/Babel, main branch)
---

# Information about Babel.
babel:
  # URL of the Babel repository on GitHub.
  github_url: "https://github.com/NCATSTranslator/Babel"

# Build information. Currently unstructured -- you can use this to write down any notes on what
# is going on with this specific build of Babel.
build:
  branch: babel-1.17
  # Free-text notes, one list item per workaround. Keep each item quoted: these are prose and may
  # contain ` #` or `: `, which would truncate or break an unquoted scalar.
  workarounds:
    - "DrugBank downloads are currently disabled https://github.com/cthoyt/drugbank-downloader/issues/23 -- I used the previously downloaded 5-1-13."

# Versions that need to be updated on every release.
# All values are quoted so that they are always read as strings.

# The next release after Biolink 4.4.2.
biolink_version: "b0d9ef6494af9b3ab931e9505d446ca8c212f50f"
umls_version: "2026AA"
# NOTE(review): looks like MMDDYYYY (the leading zero is why this must stay quoted) -- confirm
# against the RxNorm download code.
rxnorm_version: "06012026"
# Really 5-1-21, but since downloads are currently not allowed, we need to reuse our most recent
# download.
drugbank_version: "5-1-13"

# Overall inputs and outputs.
# As configured, intermediate_directory is nested inside output_directory, and tmp_directory is
# nested inside download_directory.
input_directory: input_data
download_directory: babel_downloads
intermediate_directory: babel_outputs/intermediate
output_directory: babel_outputs
# Where DuckDB spills larger-than-memory intermediates. setup_duckdb() gives each job its own
# duckdb-$SLURM_JOB_ID subdirectory here. Override per run with the BABEL_DUCKDB_TEMP_DIR
# environment variable; see slurm/README.md ("Temporary Scratch Space") for the trade-offs.
tmp_directory: babel_downloads/tmp

# HTTP settings. Nothing is set here at present: `{}` is an explicit empty mapping, whereas a
# bare `http:` would parse as null.
http: {}

# SPARQL query settings (retry behaviour).
sparql:
  # Total number of attempts (initial try + retries) before giving up; must be >= 1.
  # Example: max_attempts=3 → one initial try, then up to two retries.
  max_attempts: 3
  # Base delay in seconds between attempts; must be >= 0. Fractional values are accepted.
  # The actual delay grows exponentially from one attempt to the next
  # (attempt 1 → 1 s, attempt 2 → 2 s, attempt 3 → 4 s, …).
  retry_base_delay_seconds: 1

# Maps Python compendium names (as used in src/createcompendia/ and tests) to the
# directory names that Snakemake uses under intermediate_directory.
# Format: <Python compendium name>: <directory name>.
# Compendia not listed here use their own name as the directory name.
# Update this whenever a new semantic type uses a shortened or different directory name.
compendium_directories:
  diseasephenotype: disease
  processactivitypathway: process

#
# SHARED
#

# DuckDB settings for use in all DuckDB connections.
duckdb_config:
  # write_buffer_row_group_count=1: flush Parquet row groups to disk eagerly. The default of 5
  # means DuckDB holds 5 row groups × threads × ~4 GiB in write buffers before flushing (~76 GiB
  # peak at 128 GB allocation). Setting it to 1 keeps write-buffer RAM proportional to one row
  # group, at no cost to compression ratio or query performance.
  write_buffer_row_group_count: 1
  # enable_external_file_cache=false: DuckDB's external file cache keeps Parquet file blocks
  # mmapped across queries, which is one source of memory mappings. The cross-compendium report
  # rules were hitting the kernel's per-process mapping limit (vm.max_map_count, default 65530)
  # and failing with `bad allocation` on a small allocation despite free RAM -- an address-space
  # (mmap-count) limit, not a shortage of memory.
  #
  # Disabling this cache removes it as a mapping source, but note it was NOT sufficient on its
  # own: the dominant source is DuckDB's buffer pool (its allocator retains ~1.3 MB mappings up
  # to the query's peak memory), so the report rules also cap memory_limit far below their
  # natural peak.
  #
  # The definitive fix is for the cluster to raise vm.max_map_count (issue #846); until then we
  # keep this off (it only costs a re-read from disk for our read-once/scan-twice queries, no
  # correctness change).
  enable_external_file_cache: false

#
# UMLS
#
umls:
  # UMLS subset to use. Replace "full" with "level-0" to get only the level 0 subset; see
  # https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html
  subset: "full"
  # subset: "level-0"

#
# PROTEINS
#

# Chris Bizon prepared a list of UMLS/UniProtKB mappings which we download and use.
# This is the raw-file URL of that TSV in the cbizon/UMLS_UniProtKB GitHub repository (main branch).
UMLS_UniProtKB_download_raw_url: "https://raw.githubusercontent.com/cbizon/UMLS_UniProtKB/refs/heads/main/outputs/UMLS_UniProtKB.tsv"

#
# The rest of these configs need to be cleaned up.
#

# File names associated with NCBI.
# NOTE(review): presumably the NCBI Gene files that Babel downloads -- confirm against the
# download rules before adding or removing entries.
ncbi_files:
  - gene2ensembl.gz
  - gene_info.gz
  - gene_orthologs.gz
  - gene_refseq_uniprotkb_collab.gz
  - mim2gene_medgen

# Prefixes for generate_dirs_for_labels_and_synonyms_prefixes.
# NOTE(review): presumably the prefixes that get their own labels/synonyms directories -- confirm.
generate_dirs_for_labels_and_synonyms_prefixes:
  - GO
  - CL
  - NCIT
  - UBERON
  - CHEBI
  - HP
  - MONDO
  - PR

# TODO: This doesn't appear to be used anywhere -- but it might be neat to use this list to make
# sure we got all of them.
ubergraph_ontologies:
  - UBERON
  - CL
  - GO
  - NCIT
  - ECO
  - ECTO
  - ENVO
  - HP
  - UPHENO
  - BFO
  - BSPO
  - CARO
  - CHEBI
  - CP
  - GOREL
  - IAO
  - MAXO
  - MONDO
  - PATO
  - PR
  - RO
  - UBPROP

mods:
  - WormBase
  - FB
  - MGI
  - ZFIN
  - RGD
  - SGD

# Label, synonym and description files shared under the `common` key; each entry is a list of
# paths (currently one Ubergraph path per list).
common:
  labels:
    - ubergraph/labels
  synonyms:
    - ubergraph/synonyms.jsonl
  descriptions:
    - ubergraph/descriptions.jsonl

# Anatomy settings.
# NOTE(review): the _prefixes/_ids/_concords lists presumably name the sources read at each step,
# and anatomy_outputs the files written -- confirm against the anatomy rules before editing.
anatomy_prefixes:  [UBERON, GO, CL, UMLS, MESH, NCIT, SNOMEDCT]
anatomy_ids:       [UBERON, GO, CL, UMLS, MESH, NCIT]
anatomy_concords:  [UBERON, GO, CL, UMLS, WIKIDATA]
anatomy_outputs:   [AnatomicalEntity.txt, Cell.txt, CellularComponent.txt, GrossAnatomicalStructure.txt]
# Prefixes whose identifiers are treated as globally unique within a clique: if merging two
# cliques would produce a clique containing more than one identifier with one of these prefixes,
# the merge is blocked. UBERON and GO are authoritative enough that two distinct terms from
# either ontology should never be collapsed into the same equivalence set. If you add a new
# anatomy source whose concord file links to UBERON or GO, verify that it doesn't introduce
# spurious merges before removing either prefix from this list.
anatomy_unique_prefixes: [UBERON, GO]

# Gene settings.
# NOTE(review): presumably each list names the sources (by prefix or concord name) used for that
# step, and gene_outputs the file written -- confirm in src/createcompendia/.
gene_labels:   [HGNC, NCBIGene, UMLS, OMIM]
gene_ids:      [ENSEMBL, HGNC, NCBIGene, UMLS, OMIM, ZFIN, WormBase, FB, MGI, RGD, SGD]
gene_concords: [NCBIGene, NCBIGeneENSEMBL, medgen, UMLS, UMLS_NCBIGene]
gene_outputs:  [Gene.txt]

# Protein settings. Unlike genes, proteins also have a separate protein_synonyms list.
# NOTE(review): the exact role of each list is not visible in this file -- confirm in
# src/createcompendia/ before editing.
protein_labels:   [UniProtKB, PR, UMLS]
protein_synonyms: [PR, UMLS]
protein_ids:      [ENSEMBL, MESH, UniProtKB, PR, UMLS]
protein_concords: [UniProtKB, PR, NCIT_UniProtKB, NCIT_UMLS, UMLS, UMLS_UniProtKB]
protein_outputs:  [Protein.txt]

# Disease/phenotype settings. Note that labels and synonyms share one combined list here
# (disease_labelsandsynonyms) instead of separate _labels/_synonyms keys, and that two output
# files are listed (Disease.txt and PhenotypicFeature.txt).
# NOTE(review): the exact role of each list is not visible in this file -- confirm in
# src/createcompendia/ before editing.
disease_labelsandsynonyms: [MONDO, DOID, Orphanet, HP, MESH, NCIT, UMLS, SNOMEDCT, EFO]
disease_ids:               [MONDO, DOID, Orphanet, HP, MESH, NCIT, UMLS, OMIM, EFO]
disease_concords:          [HP, MONDO, UMLS, DOID, EFO, Manual]
disease_outputs:           [Disease.txt, PhenotypicFeature.txt]

# Process/activity/pathway settings. process_labels and process_ids currently hold the same
# prefixes, and three output files are listed.
# NOTE(review): the exact role of each list is not visible in this file -- confirm in
# src/createcompendia/ before editing.
process_labels:   [GO, REACT, RHEA, EC, SMPDB, PANTHER.PATHWAY, UMLS]
process_ids:      [GO, REACT, RHEA, EC, SMPDB, PANTHER.PATHWAY, UMLS]
process_concords: [GO, RHEA, UMLS]
process_outputs:  [Pathway.txt, BiologicalProcess.txt, MolecularActivity.txt]

# UniChem data sources and chemical settings.
# KEGG.COMPOUND removed from UniChem — https://github.com/NCATSTranslator/Babel/issues/834
# (this note applies to unichem_datasources, chemical_labels and chemical_ids).
unichem_datasources:
  [CHEMBL.COMPOUND, DRUGBANK, GTOPDB, CHEBI, UNII, HMDB, PUBCHEM.COMPOUND, DrugCentral]

chemical_labels:
  [CHEMBL.COMPOUND, GTOPDB, CHEBI, UNII, HMDB, PUBCHEM.COMPOUND, DrugCentral, UMLS, DRUGBANK]
chemical_synonyms:
  [GTOPDB, CHEBI, UNII, HMDB, PUBCHEM.COMPOUND, UMLS, DRUGBANK]
chemical_concords:
  [wikipedia_mesh_chebi, PUBCHEM_MESH, mesh_cas, mesh_unii, PUBCHEM_CAS, GTOPDB, CHEBI, UMLS,
   DrugCentral, RXNORM]
chemical_ids:
  [CHEMBL.COMPOUND, GTOPDB, CHEBI, UNII, HMDB, PUBCHEM.COMPOUND, DrugCentral, DRUGBANK, MESH,
   UMLS, RXNORM]
chemical_outputs:
  [MolecularMixture.txt, SmallMolecule.txt, Polypeptide.txt, ComplexMolecularMixture.txt,
   ChemicalEntity.txt, ChemicalMixture.txt, Drug.txt]

# Synonym output file names for the two conflations (DrugChemical and GeneProtein).
drugchemicalconflated_synonym_outputs: [DrugChemicalConflated.txt]
geneproteinconflated_synonym_outputs:  [GeneProteinConflated.txt]

# Taxon settings.
# NOTE(review): the exact role of each list is not visible in this file -- confirm in
# src/createcompendia/ before editing.
taxon_labels:   [NCBITaxon, MESH, UMLS]
taxon_synonyms: [NCBITaxon, UMLS]
taxon_ids:      [NCBITaxon, MESH, UMLS]
taxon_concords: [NCBI_MESH, UMLS]
taxon_outputs:  [OrganismTaxon.txt]

# Cell lines: only an output file is configured.
cell_line_outputs: [CellLine.txt]

# Gene families: labels and ids use the same two prefixes.
genefamily_labels:  [PANTHER.FAMILY, HGNC.FAMILY]
genefamily_ids:     [PANTHER.FAMILY, HGNC.FAMILY]
genefamily_outputs: [GeneFamily.txt]

# UMLS: only an output file is configured.
umls_outputs: [umls.txt]

# Macromolecular complexes: only an output file is configured.
macromolecularcomplex_outputs: [MacromolecularComplex.txt]

# Maps IRI stems (the leading part of an IRI, up to where the local identifier starts) to a
# prefix. Several stems may map to the same prefix (e.g. both NCBI gene stems map to NCBIGene and
# both Ensembl stems map to ENSEMBL). Keys are quoted because they contain `:` and `?`.
# NOTE(review): presumably applied to IRIs coming from Ubergraph that are not otherwise
# recognised -- confirm against the Ubergraph code.
ubergraph_iri_stem_to_prefix_map:
  "https://identifiers.org/ncbigene/": NCBIGene
  "http://www.ncbi.nlm.nih.gov/gene/": NCBIGene
  "http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=": HGNC
  "http://www.informatics.jax.org/marker/MGI:": MGI
  "http://www.pombase.org/spombe/result/": PomBase
  "http://www.wormbase.org/species/c_elegans/gene/": WormBase
  "http://flybase.org/reports/": FB
  "http://www.ecogene.org/gene/": ecogene
  "http://zfin.org/action/marker/view/": ZFIN
  "http://birdgenenames.org/cgnc/GeneReport?id=": cgnc
  "http://www.ensemblgenomes.org/id/": ENSEMBL
  "http://www.ensembl.org/id/": ENSEMBL
  "http://purl.obolibrary.org/obo/AISM_": AISM
  "http://purl.obolibrary.org/obo/BCO_": BCO
  "http://purl.obolibrary.org/obo/CDNO_": CDNO
  "http://purl.obolibrary.org/obo/CLAO_": CLAO
  "http://purl.obolibrary.org/obo/GNO_": GNO
  "http://purl.obolibrary.org/obo/HAO_": HAO
  "http://purl.obolibrary.org/obo/LEPAO_": LEPAO
  "http://purl.obolibrary.org/obo/MMO_": MMO
  "http://purl.obolibrary.org/obo/MRO_": MRO
  "http://purl.obolibrary.org/obo/MmusDv_": MmusDv
  "http://purl.obolibrary.org/obo/OARCS_": OARCS
  "http://purl.obolibrary.org/obo/OBA_": OBA
  "http://purl.obolibrary.org/obo/PCL_": PCL
  "http://purl.obolibrary.org/obo/PECO_": PECO
  "http://purl.obolibrary.org/obo/PPO_": PPO
  "http://purl.obolibrary.org/obo/TS_": TS
  "http://purl.obolibrary.org/obo/UO_": UO
  "http://purl.obolibrary.org/obo/PCO_": PCO
  "http://rgd.mcw.edu/rgdweb/report/gene/main.html?id=": RGD

# Output file name for publications.
publication_outputs: [Publication.txt]

# Output file name for GeneProtein.
geneprotein_outputs: [GeneProtein.txt]

# Output file name for DrugChemical.
drugchemical_outputs: [DrugChemical.txt]

# Keyed by Biolink type; the value is a list of identifier prefixes.
# NOTE(review): presumably labels from these prefixes are boosted when choosing a preferred name,
# and the list order may be significant -- confirm against the preferred-name code before
# reordering.
preferred_name_boost_prefixes:
  biolink:ChemicalEntity: [DRUGBANK, DrugCentral, CHEBI, MESH, GTOPDB]

# Ensembl datasets to skip. The commented-out entries are currently NOT skipped; they are kept
# here for reference.
# WARNING: if every entry ends up commented out, write `ensembl_datasets_to_skip: []` instead --
# a bare key with no items parses as null, not as an empty list.
ensembl_datasets_to_skip:
#  - elucius_gene_ensembl
#  - hgfemale_gene_ensembl
#  - charengus_gene_ensembl
#  - otshawytscha_gene_ensembl
#  - aocellaris_gene_ensembl
  - omykiss_gene_ensembl        # 2026-06-05: we keep getting stuck on this dataset for some reason.

# Labels longer than this limit are demoted (not used as the preferred label if a shorter
# alternative exists).
# Keyed by Biolink type; types not listed here are never demoted. Uses ancestor traversal, so
# biolink:ChemicalEntity applies to all chemical subtypes (SmallMolecule, Drug, etc.).
# NOTE(review): the limit is presumably a length in characters -- confirm against the code.
# See https://github.com/NCATSTranslator/Babel/issues/597
demote_labels_longer_than:
  biolink:ChemicalEntity: 25