PubChem API in R#

by Vishank Patel and Adam M. Nguyen

Documentation:

PubChem API Documentation: https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access

These recipe examples were tested on March 24, 2023.

See the bottom of the document for information on R and package versions.

Attribution: This tutorial was adapted from supporting information in:

Scalfani, V. F.; Ralph, S. C.; Alshaikh, A. A.; Bara, J. E. Programmatic Compilation of Chemical Data and Literature From PubChem Using Matlab. Chemical Engineering Education, 2020, 54, 230. https://doi.org/10.18260/2-1-370.660-115508 (and vfscalfani/MATLAB-cheminformatics)

Setup#

Importing the necessary libraries and setting up the base api:

library(tidyverse)  # attaches the core tidyverse packages (dplyr, purrr, tibble, ... - see the session info)
library(dplyr)      # data-frame manipulation verbs (already attached by tidyverse)
library(purrr)      # functional iteration helpers such as map() (already attached by tidyverse)
library(httr)       # GET() API requests
library(jsonlite)   # converting to/from JSON
library(knitr)      # include_graphics() for embedding images
library(imager)     # including images
library(magick)     # image manipulation

# Base URL for the PubChem PUG REST compound endpoint; every request URL below
# is built by appending to this string
api <- 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/'

1. PubChem Similarity#

Search for chemical structures in PubChem via a Fingerprint Tanimoto Similarity Search.

Get compound image#

# CID of the query compound; swap in any other PubChem CID to customize
compoundID <- "2734162"

# Assemble the endpoint that serves this compound's image as a PNG
CID_URL <- sprintf("%scid/%s/PNG", api, compoundID)

# Embed the image in the rendered document
include_graphics(CID_URL)

Replace the above CID value (compoundID) with a different CID to customize.

Retrieve InChI and SMILES#

Retrieve InChI

# Request the InChI of `compoundID` as plain text
inchi_url <- paste0(api, "cid/", compoundID, "/property/inchi/TXT")

inchi_response <- GET(inchi_url)
# Fail loudly on an HTTP error instead of silently keeping the error body as
# if it were the InChI string
stop_for_status(inchi_response)

# `$content` is the raw (byte) response body; decode it to text and strip the
# newline character(s) the TXT output carries
raw_inchi <- rawToChar(inchi_response$content)
inchi <- gsub("\n", "", raw_inchi, fixed = TRUE)
inchi
## [1] "InChI=1S/C8H15N2/c1-3-4-5-10-7-6-9(2)8-10/h6-8H,3-5H2,1-2H3/q+1"

Retrieve Isomeric SMILES

# Request the isomeric SMILES of `compoundID` as plain text
IS_url <- paste0(api, "cid/", compoundID, "/property/IsomericSMILES/TXT")

IS_response <- GET(IS_url)
# Fail loudly on an HTTP error instead of silently keeping the error body as
# if it were the SMILES string
stop_for_status(IS_response)

# Decode the raw response body and strip the newline character(s)
raw_IS <- rawToChar(IS_response$content)
IS <- gsub("\n", "", raw_IS, fixed = TRUE)
IS
## [1] "CCCCN1C=C[N+](=C1)C"

Retrieve Identifier and Property Data#

Create an identifier/property dataset from Similarity Search results.

Retrieve the following data from CID hit results: InChI, Isomeric SMILES, MW, Heavy Atom Count, Rotatable Bond Count, and Charge.

# Build an identifier/property table for the first 25 similarity-search hits.
# NOTE(review): `CIDs1_df` is not defined in this part of the document; it is
# assumed to hold the similarity search results with a `CID` column - confirm.
# head() returns at most 25 CIDs and, unlike `[1:25]`, does not pad with NA
# when the search returned fewer hits.
short_CIDs <- head(CIDs1_df$CID, 25)

# Base URL for the PubChem API (set once instead of on every iteration)
api <- "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/"

# Output column name -> PubChem property name, in the order they are requested
property_names <- c(
  InChi = "InChI",
  IsoSMI = "IsomericSMILES",
  MW = "MolecularWeight",
  Heavy_Atom_Count = "HeavyAtomCount",
  Rotatable_Bond_Count = "RotatableBondCount",
  Charge = "Charge"
)

# Fetch a single property of a single compound as a plain-text string.
#   cid      - compound ID, already formatted as text
#   property - PubChem property name (e.g. "MolecularWeight")
#   base_url - API base URL
# Returns the response text with newlines removed, or NA (with a warning) when
# the server answers with an HTTP error, so that an error message body never
# ends up in the table as if it were data.
fetch_cid_property <- function(cid, property, base_url = api) {
  property_url <- paste0(base_url, "cid/", cid, "/property/", property, "/TXT")
  response <- GET(property_url)
  Sys.sleep(1) # adding a delay for the PubChem server

  if (http_error(response)) {
    warning(
      "PubChem request failed (HTTP ", status_code(response), "): ",
      property_url,
      call. = FALSE
    )
    return(NA_character_)
  }

  gsub("\n", "", rawToChar(response$content), fixed = TRUE)
}

# Zero-row template: keeps the 7 character columns even if there are no hits
empty_results <- tibble(
  Compound_ID = character(),
  InChi = character(),
  IsoSMI = character(),
  MW = character(),
  Heavy_Atom_Count = character(),
  Rotatable_Bond_Count = character(),
  Charge = character()
)

# One single-row tibble per CID; rows are combined once at the end rather than
# growing the tibble inside the loop
result_rows <- map(short_CIDs, function(CID) {
  # format() avoids scientific notation: a numeric CID such as 100000 would
  # otherwise be rendered as "1e+05" and produce an invalid URL
  cid_text <- format(CID, scientific = FALSE, trim = TRUE)

  property_values <- map_chr(
    property_names,
    function(property) fetch_cid_property(cid_text, property)
  )

  as_tibble_row(c(Compound_ID = cid_text, property_values))
})

similarity_results_tibble <- bind_rows(c(list(empty_results), result_rows))

similarity_results_tibble
## # A tibble: 25 × 7
##    Compound_ID InChi                         IsoSMI MW    Heavy…¹ Rotat…² Charge
##    <chr>       <chr>                         <chr>  <chr> <chr>   <chr>   <chr> 
##  1 2734161     InChI=1S/C8H15N2.ClH/c1-3-4-… CCCCN… 174.… 11      3       0     
##  2 61347       InChI=1S/C7H12N2/c1-2-3-5-9-… CCCCN… 124.… 9       3       0     
##  3 529334      InChI=1S/C8H14N2/c1-2-3-4-6-… CCCCC… 138.… 10      4       0     
##  4 304622      InChI=1S/C8H14N2/c1-3-4-6-10… CCCCN… 138.… 10      3       0     
##  5 118785      InChI=1S/C6H10N2/c1-2-4-8-5-… CCCN1… 110.… 8       2       0     
##  6 12971008    InChI=1S/C7H13N2.HI/c1-3-4-9… CCCN1… 252.… 10      2       0     
##  7 11448496    InChI=1S/C8H15N2.HI/c1-3-4-5… CCCCN… 266.… 11      3       0     
##  8 11424151    InChI=1S/C8H15N2.CHNS/c1-3-4… CCCCN… 197.… 13      3       0     
##  9 11171745    InChI=1S/C8H15N2.C2N3/c1-3-4… CCCCN… 205.… 15      3       0     
## 10 11160028    InChI=1S/C7H13N2.BrH/c1-3-4-… CCCN1… 205.… 10      2       0     
## # … with 15 more rows, and abbreviated variable names ¹​Heavy_Atom_Count,
## #   ²​Rotatable_Bond_Count

We will now export the generated dataframe as a tab-separated text file. The file will be saved in the `Data` folder inside the present working directory.

# Export the results table as a tab-separated text file.
output_file <- "Data/R_Similarityq_results.txt"

# write.table() does not create missing folders, so make sure the target
# directory exists first (no-op when it is already there)
dir.create(dirname(output_file), showWarnings = FALSE, recursive = TRUE)

# row.names = TRUE together with col.names = NA writes the row numbers as the
# first column and leaves a blank header cell above them
write.table(
  similarity_results_tibble,
  file = output_file,
  sep = "\t",
  row.names = TRUE,
  col.names = NA
)

R Session Info#

# Print the R version, platform, locale, and the versions of the attached and
# loaded packages that were used to run the examples above
sessionInfo()
## R version 4.2.1 (2022-06-23 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19042)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] magick_2.7.4    imager_0.42.18  magrittr_2.0.3  knitr_1.42     
##  [5] jsonlite_1.8.4  httr_1.4.5      lubridate_1.9.2 forcats_1.0.0  
##  [9] stringr_1.5.0   dplyr_1.1.0     purrr_1.0.1     readr_2.1.4    
## [13] tidyr_1.3.0     tibble_3.1.8    ggplot2_3.4.1   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.0 xfun_0.37        bslib_0.4.2      colorspace_2.1-0
##  [5] vctrs_0.5.2      generics_0.1.3   htmltools_0.5.4  yaml_2.3.7      
##  [9] utf8_1.2.3       rlang_1.0.6      jquerylib_0.1.4  pillar_1.8.1    
## [13] glue_1.6.2       withr_2.5.0      jpeg_0.1-10      lifecycle_1.0.3 
## [17] munsell_0.5.0    gtable_0.3.1     evaluate_0.20    tzdb_0.3.0      
## [21] fastmap_1.1.0    curl_5.0.0       fansi_1.0.4      highr_0.10      
## [25] Rcpp_1.0.10      scales_1.2.1     cachem_1.0.7     hms_1.1.2       
## [29] bmp_0.3          png_0.1-8        digest_0.6.31    stringi_1.7.12  
## [33] tiff_0.1-11      grid_4.2.1       cli_3.6.0        tools_4.2.1     
## [37] sass_0.4.5       readbitmap_0.1.5 pkgconfig_2.0.3  ellipsis_0.3.2  
## [41] timechange_0.2.0 rmarkdown_2.20   rstudioapi_0.14  R6_2.5.1        
## [45] igraph_1.4.1     compiler_4.2.1