Search Environment Data • dmcognigen

library(dmcognigen)
library(stringr)
library(dplyr)

search_environment_data() searches for a pattern in variable names, variable labels, and variable content. cnt_search_result() summarizes the matching results. These functions are particularly useful when some information is expected to exist somewhere in the data, but its source (dataset or variable) is unknown. The default envir is the global environment, but other environments and named lists that include data frames are also accepted.

To demonstrate, load all of the pharmaversesdtm datasets into an environment.

pharmaversesdtm_data_env <- new.env()

data(
  list = data(package = "pharmaversesdtm")$results[ , "Item"], 
  package = "pharmaversesdtm", 
  envir = pharmaversesdtm_data_env
)

Then search for a pattern:

search_environment_data(
  regex("XANOMELINE", ignore_case = TRUE),
  envir = pharmaversesdtm_data_env
)
#> 
#> ── Search results ──────────────────────────────────────────────────────────────
#> 
#> ── dm ──
#> 
#> ✔ Variable content: ARM and ACTARM
#> 
#> ── dm_peds ──
#> 
#> ✔ Variable content: ARM and ACTARM
#> 
#> ── ex ──
#> 
#> ✔ Variable content: EXTRT
#> 
#> ── ex_ophtha ──
#> 
#> ✔ Variable content: EXTRT
#> 
#> ── pc ──
#> 
#> ✔ Variable content: PCTEST
#> 
#> ── pp ──
#> 
#> ✔ Variable content: PPCAT
#> 
#> ── ts ──
#> 
#> ✔ Variable content: TSVAL

To immediately review the content that matched the pattern, pass the search result to cnt_search_result().

search_environment_data(
  regex("XANOMELINE", ignore_case = TRUE),
  envir = pharmaversesdtm_data_env
) %>% 
  cnt_search_result(
    n_distinct_vars = "USUBJID"
  )
#> $dm
#> # A tibble: 3 × 4
#>   ARM                  ACTARM               n_USUBJID     n
#>   <chr>                <chr>                    <int> <int>
#> 1 Xanomeline High Dose Xanomeline High Dose        72    72
#> 2 Xanomeline High Dose Xanomeline Low Dose         12    12
#> 3 Xanomeline Low Dose  Xanomeline Low Dose         84    84
#> 
#> $dm_peds
#> # A tibble: 2 × 4
#>   ARM                  ACTARM               n_USUBJID     n
#>   <chr>                <chr>                    <int> <int>
#> 1 Xanomeline High Dose Xanomeline High Dose         2     2
#> 2 Xanomeline Low Dose  Xanomeline Low Dose          1     1
#> 
#> $ex
#> # A tibble: 1 × 3
#>   EXTRT      n_USUBJID     n
#>   <chr>          <int> <int>
#> 1 XANOMELINE       168   365
#> 
#> $ex_ophtha
#> # A tibble: 1 × 3
#>   EXTRT      n_USUBJID     n
#>   <chr>          <int> <int>
#> 1 XANOMELINE       168   365
#> 
#> $pc
#> # A tibble: 1 × 3
#>   PCTEST     n_USUBJID     n
#>   <chr>          <int> <int>
#> 1 XANOMELINE       254  4572
#> 
#> $pp
#> # A tibble: 1 × 3
#>   PPCAT      n_USUBJID     n
#>   <chr>          <int> <int>
#> 1 XANOMELINE       168  2688
#> 
#> $ts
#> # A tibble: 3 × 2
#>   TSVAL                                                                        n
#>   <chr>                                                                    <int>
#> 1 "Safety and Efficacy of the Xanomeline Transdermal Therapeutic System (…     1
#> 2 "To document the safety profile of the xanomeline TTS."                      1
#> 3 "Xanomeline"                                                                 1

One way to further collect and review the results is to combine the per-dataset summaries into a single data frame with bind_rows().

search_environment_data(
  regex("STUDYID", ignore_case = TRUE),
  envir = pharmaversesdtm_data_env
) %>% 
  cnt_search_result(
    n_distinct_vars = "USUBJID"
  ) %>% 
  bind_rows(.id = "dataset")
#> # A tibble: 44 × 4
#>    dataset    STUDYID      n_USUBJID     n
#>    <chr>      <chr>            <int> <int>
#>  1 ae         CDISCPILOT01       225  1191
#>  2 ae_ophtha  CDISCPILOT01       225  1191
#>  3 ce_vaccine ABC                  2    44
#>  4 cm         CDISCPILOT01       229  7510
#>  5 dm         CDISCPILOT01       306   306
#>  6 dm_peds    CDISCPILOT01         5     5
#>  7 dm_vaccine ABC                  2     2
#>  8 ds         CDISCPILOT01       306   850
#>  9 eg         CDISCPILOT01       254 26717
#> 10 ex         CDISCPILOT01       254   591
#> # ℹ 34 more rows

Optional arguments to cnt_search_result() can be used to exclude datasets (ignore_df_names) and to include additional variables in the summary (extra_vars).

search_environment_data(
  regex("cancer", ignore_case = TRUE),
  envir = pharmaversesdtm_data_env
) %>% 
  cnt_search_result(
    n_distinct_vars = "USUBJID",
    ignore_df_names = c("ae", "ae_ophtha"),
    extra_vars = c("STUDYID")
  )
#> $mh
#> # A tibble: 9 × 5
#>   MHLLT                             MHDECOD              STUDYID n_USUBJID     n
#>   <chr>                             <chr>                <chr>       <int> <int>
#> 1 BREAST CANCER                     BREAST CANCER        CDISCP…         3     3
#> 2 CARCINOMA PROSTATE                PROSTATE CANCER      CDISCP…         1     1
#> 3 CARCINOMA SKIN                    SKIN CANCER          CDISCP…         1     1
#> 4 MALIGNANT NASOPHARYNGEAL NEOPLASM NASOPHARYNGEAL CANC… CDISCP…         1     1
#> 5 ORAL CANCER STAGE UNSPECIFIED     LIP AND/OR ORAL CAV… CDISCP…         1     1
#> 6 PROSTATE CANCER                   PROSTATE CANCER      CDISCP…         3     3
#> 7 PROSTATIC CARCINOMA               PROSTATE CANCER      CDISCP…         1     1
#> 8 SKIN CARCINOMA                    SKIN CANCER          CDISCP…         1     1
#> 9 THYROID CARCINOMA                 THYROID GLAND CANCER CDISCP…         1     1
#> 
#> $smq_db
#> # A tibble: 3 × 2
#>   termchar                         n
#>   <chr>                        <int>
#> 1 Bile duct cancer recurrent       2
#> 2 Gallbladder cancer               2
#> 3 Gallbladder cancer recurrent     2
#> 
#> $supprs_onco_imwg
#> # A tibble: 1 × 4
#>   QLABEL                       STUDYID      n_USUBJID     n
#>   <chr>                        <chr>            <int> <int>
#> 1 New Anti-Cancer Therapy Date CDISCPILOT01         3     9