# Source: DrugMesh/conf/pipeline.conf (repository rbr7/DrugMesh, branch main)
# =====================================================================================
# DrugMesh pipeline configuration (HOCON, loaded with PureConfig).
#
# Replaces the toggled booleans hard-coded in the original CreateDrugMappings.main().
# Each enrichment pass is enabled/ordered declaratively here instead of by commenting
# lines in/out of Scala source.
# =====================================================================================

drugmesh {

  # Spark session settings.
  spark {
    app-name = "drugmesh"
    # Default for local development. The second assignment is a HOCON optional
    # substitution: it replaces the default only when DRUGMESH_SPARK_MASTER is set
    # in the environment and is ignored otherwise.
    # NOTE(review): Spark gives properties set in code precedence over spark-submit
    # flags, so if the loader passes this value to the session builder, `--master`
    # alone will not override it — confirm against the loader, or use the env var.
    master   = "local[*]"
    master   = ${?DRUGMESH_SPARK_MASTER}
    shuffle-partitions = 64
    # Adaptive Query Execution: runtime partition coalescing + skew-join handling.
    adaptive-enabled = true
  }

  # Input/output locations. All paths here are relative.
  # NOTE(review): presumably resolved against the process working directory — confirm.
  io {
    # Cached source files (download once, re-read many times).
    drugbank-xml   = "data/sources/drugbank_full_database.xml"
    ttd-raw        = "data/sources/ttd_drug_download.txt"
    kegg-drug      = "data/sources/kegg_drug.txt"
    stitch-sider   = "data/sources/drugbank_sider_mapping.tsv"
    dgidb-drugs    = "data/sources/dgidb_drugs.tsv"

    # Output — diff-matches the original 13-column drug-mappings.tsv contract.
    output-tsv     = "out/drug-mappings.tsv"
    # API responses are cached to Parquet so reruns don't re-hit rate-limited endpoints.
    api-cache-dir  = "cache/api"
  }

  # Ordered enrichment DAG. Each pass is a typed Dataset[DrugEntry] => Dataset[DrugEntry].
  # Toggle/reorder without recompiling: list order is the execution order, and a pass
  # is disabled by removing (or commenting out) its entry.
  enrichment {
    passes = [
      "drugbank-base",     # seed from DrugBank XML
      "ttd-merge",         # merge Therapeutic Target Database ids
      "chembl-unichem",    # UniChem -> ChEMBL ids
      "pubchem-pugrest",   # PubChem CIDs
      "kegg-file",         # KEGG drug ids/cids
      "umls-cuis",         # UMLS CUIs (Metathesaurus REST)
      "stitch-sider",      # STITCH ids via DrugBank-SIDER mapping
      "dgidb"              # DGIdb cross-check
    ]
  }

  # HTTP client settings for the external APIs.
  clients {
    # Rate limiting + retry/backoff for the external APIs.
    # NOTE(review): these four values sit directly under `clients`, not under a
    # per-service key — presumably one policy shared by every client; confirm.
    # TODO: replace the placeholder contact address before running against live APIs.
    user-agent       = "DrugMesh/0.1 (research; contact: you@example.org)"
    max-retries      = 4
    base-backoff     = "500 millis"
    requests-per-sec = 5

    # NOTE(review): `chembl-base` holds the UniChem REST root (it serves the
    # "chembl-unichem" pass), not a ChEMBL API root. The key name is kept so the
    # existing config binding still loads.
    chembl-base  = "https://www.ebi.ac.uk/unichem/rest"
    pubchem-base = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
    # NOTE(review): verify that the v2 REST root is still served by DGIdb.
    dgidb-base   = "https://dgidb.org/api/v2"

    umls {
      # UMLS requires a (free) license + API key. Never commit it; load from env/secrets.
      base       = "https://uts-ws.nlm.nih.gov/rest"
      auth-base  = "https://utslogin.nlm.nih.gov"
      # Optional substitution: when UMLS_API_KEY is unset, `api-key` is not defined
      # at all (it is NOT an empty string).
      # NOTE(review): the bound field must therefore be optional, or loading fails
      # with a missing-key error — confirm against the config case class.
      api-key    = ${?UMLS_API_KEY}
    }
  }

  # Settings for the ML-backed stages.
  ml {
    entity-resolution {
      # Blocking key + comparison thresholds for the Fellegi-Sunter matcher.
      blocking-keys  = [ "normalized-name-prefix", "inchikey-prefix" ]
      # Keep review-lower < match-threshold; scores in between go to clerical review.
      # NOTE(review): presumably scores below review-lower are non-matches — confirm.
      match-threshold = 0.90      # posterior match probability above which a pair links
      review-lower    = 0.45      # below match-threshold but above this => clerical review
    }
    anomaly {
      # NOTE(review): parameter names suggest an isolation-forest style detector — confirm.
      contamination   = 0.02      # expected fraction of dirty rows
      num-estimators  = 100
      max-samples     = 256
    }
    search {
      # NOTE(review): presumably an Elasticsearch endpoint (from the `es-` prefix) — confirm.
      es-nodes       = "localhost"
      es-port        = 9200
      index-name     = "drug-mappings"
    }
    embeddings {
      model          = "sbiobert_base_cased_mli"   # Spark NLP sentence-BERT
      # Must match the output width of `model`; update both together.
      dimension      = 768
    }
  }
}