-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpipeline.conf
More file actions
89 lines (80 loc) · 3.27 KB
/
Copy pathpipeline.conf
File metadata and controls
89 lines (80 loc) · 3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# =====================================================================================
# DrugMesh pipeline configuration (HOCON, loaded with PureConfig).
#
# Replaces the toggled booleans hard-coded in the original CreateDrugMappings.main().
# Each enrichment pass is enabled/ordered declaratively here instead of by commenting
# lines in/out of Scala source.
# =====================================================================================
drugmesh {
spark {
  # Spark session settings for the pipeline.
  app-name = "drugmesh"

  # Master URL. The default runs locally on all available cores.
  # Spark gives properties set in application code precedence over spark-submit flags,
  # so if the loader passes this value to the session builder, `--master` alone will
  # NOT override it. For cluster runs, set the environment variable instead, e.g.
  #   DRUGMESH_SPARK_MASTER=yarn spark-submit ...
  # When the variable is unset the optional substitution is a no-op and the default stays.
  # NOTE(review): confirm how the loader applies `master` to the SparkSession.
  master = "local[*]"
  master = ${?DRUGMESH_SPARK_MASTER}

  # Number of shuffle partitions (presumably spark.sql.shuffle.partitions — confirm in loader).
  shuffle-partitions = 64

  # Adaptive Query Execution: runtime partition coalescing + skew-join handling.
  adaptive-enabled = true
}
io {
  # ---- Inputs -------------------------------------------------------------------
  # Locally cached copies of the upstream source files: downloaded once, then
  # re-read on every run.
  drugbank-xml  = "data/sources/drugbank_full_database.xml"
  ttd-raw       = "data/sources/ttd_drug_download.txt"
  kegg-drug     = "data/sources/kegg_drug.txt"
  stitch-sider  = "data/sources/drugbank_sider_mapping.tsv"
  dgidb-drugs   = "data/sources/dgidb_drugs.tsv"

  # ---- Output -------------------------------------------------------------------
  # Must stay diff-compatible with the original 13-column drug-mappings.tsv contract.
  output-tsv    = "out/drug-mappings.tsv"

  # ---- API cache ----------------------------------------------------------------
  # External API responses are persisted as Parquet under this directory, so a rerun
  # does not hit the rate-limited endpoints again.
  api-cache-dir = "cache/api"
}
# Enrichment passes, listed in execution order. Every pass is a typed
# Dataset[DrugEntry] => Dataset[DrugEntry] transformation, so passes can be switched
# on/off or reordered by editing this list — no recompilation needed.
enrichment {
  passes = [
    # Seed entries from the DrugBank XML.
    "drugbank-base",
    # Merge Therapeutic Target Database ids.
    "ttd-merge",
    # UniChem -> ChEMBL ids.
    "chembl-unichem",
    # PubChem CIDs.
    "pubchem-pugrest",
    # KEGG drug ids/cids.
    "kegg-file",
    # UMLS CUIs (Metathesaurus REST).
    "umls-cuis",
    # STITCH ids via the DrugBank-SIDER mapping.
    "stitch-sider",
    # DGIdb cross-check.
    "dgidb"
  ]
}
clients {
  # ---- Shared HTTP behaviour: rate limiting plus retry/backoff for external APIs ----
  user-agent       = "DrugMesh/0.1 (research; contact: you@example.org)"
  max-retries      = 4
  base-backoff     = "500 millis"
  requests-per-sec = 5

  # ---- Service base URLs ----
  # NOTE(review): `chembl-base` points at the UniChem REST root, which matches the
  # "chembl-unichem" pass (UniChem -> ChEMBL ids); the key name is kept as-is.
  chembl-base  = "https://www.ebi.ac.uk/unichem/rest"
  pubchem-base = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
  dgidb-base   = "https://dgidb.org/api/v2"

  # ---- UMLS ----
  # UMLS access needs a (free) license and an API key. The key must never be
  # committed: it is read from the UMLS_API_KEY environment variable, and the
  # setting is simply left undefined when that variable is not set.
  umls.base      = "https://uts-ws.nlm.nih.gov/rest"
  umls.auth-base = "https://utslogin.nlm.nih.gov"
  umls.api-key   = ${?UMLS_API_KEY}
}
ml {
  # Entity resolution: blocking keys and decision thresholds for the
  # Fellegi-Sunter matcher.
  entity-resolution {
    blocking-keys = [
      "normalized-name-prefix",
      "inchikey-prefix"
    ]
    # A pair is linked when its posterior match probability exceeds this value.
    match-threshold = 0.90
    # Pairs scoring below match-threshold but above this value go to clerical review.
    review-lower = 0.45
  }

  # Anomaly detection over the mapped rows.
  # NOTE(review): num-estimators / max-samples look like ensemble-size and
  # per-estimator sample-size settings — confirm against the consuming code.
  anomaly {
    # Expected fraction of dirty rows.
    contamination = 0.02
    num-estimators = 100
    max-samples = 256
  }

  # Search index connection settings.
  search {
    es-nodes = "localhost"
    es-port = 9200
    index-name = "drug-mappings"
  }

  # Sentence embeddings.
  embeddings {
    # Spark NLP sentence-BERT model name.
    model = "sbiobert_base_cased_mli"
    # NOTE(review): presumably has to equal the model's output vector size — confirm.
    dimension = 768
  }
}
}