PubChem API in C

by Cyrus Gomes

PubChem API Documentation: https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access

These recipe examples were tested on July 25, 2023.

Attribution: This tutorial was adapted from supporting information in:

Scalfani, V. F.; Ralph, S. C.; Alshaikh, A. A.; Bara, J. E. Programmatic Compilation of Chemical Data and Literature From PubChem Using Matlab. Chemical Engineering Education, 2020, 54, 230. https://doi.org/10.18260/2-1-370.660-115508 (and vfscalfani/MATLAB-cheminformatics)

Setup#

First, install the CURL package by typing the following command in the terminal:

!sudo apt install curl jq libcurl4-openssl-dev

Then we create a directory that will hold all of our PubChem projects:

!mkdir Pub_Chem

Finally, we change the directory to the folder we created:

%cd Pub_Chem

1. PubChem Property#

Get property details#

First, we create a folder for the current project and change into that directory:

!mkdir Property
%cd Property

We utilize the %%file command to create the following makefile which will compile our program and create an executable.

%%file makefile

# Set the variable CC to gcc, which is used to build the program
CC=gcc

# Enable debugging information and enable all compiler warnings
CFLAGS=-g -Wall

# Set the bin variable as the name of the binary file we are creating
BIN=property_search

# "all" and "clean" name actions, not files; declaring them phony makes sure
# they still run even if a file called "all" or "clean" exists in the directory
.PHONY: all clean

# Create the binary file with the name we put
all: $(BIN)

# Map any file ending in .c to a binary executable.
# "$<" represents the .c file and "$@" represents the target binary executable.
# The recipe compiles the .c file using the gcc compiler with the CFLAGS and
# links the resulting binary with the CURL library.
# (Comments are kept outside of the recipe: a tab-indented "#" line is handed
# to the shell and echoed on every build.)
%: %.c
	$(CC) $(CFLAGS) $< -o $@ -lcurl

# Clean target which removes specific files: the binary file and any ".dSYM"
# (debug symbols for debugging) directories.
# The RM command uses -r to remove directories and -f to force delete.
clean:
	$(RM) -rf $(BIN) *.dSYM

The %%file command is used again to create our .c file, which contains the code for the program:

%%file property_search.c

#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * CURL program that retrieves property details about a CID and outputs them
 * to the terminal. Custom property fields can be added.
 *
 * Usage:
 *   property_search <CID>                  -> default properties
 *   property_search <CID> -p               -> default properties
 *   property_search <CID> -p <prop1,prop2> -> custom, comma-separated properties
 *
 * No write callback is installed, so libcurl writes the response body to
 * stdout. All diagnostics therefore go to stderr so that they never mix with
 * the JSON when the output is piped into another tool.
 *
 * Returns EXIT_SUCCESS when the transfer completed and EXIT_FAILURE on invalid
 * arguments, an over-long URL, a failed CURL initialization or a failed transfer.
 */
int main (int argc, char* argv[]) {

    // Fixed bits of the url that are joined together later
    const char api[] = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/";
    const char default_properties[] = "inchi,IsomericSMILES,MolecularFormula,MolecularWeight";
    const char *properties = NULL;
    char url[1000];

    // Validate the arguments before any CURL resource is acquired
    if ((argc == 2) || ((argc == 3) && (strcmp(argv[2], "-p") == 0))) {
        // No property list given: use the default property
        properties = default_properties;
    }
    else if ((argc == 4) && (strcmp(argv[2], "-p") == 0)) {
        // Custom property list supplied after "-p"
        properties = argv[3];
    }
    else {
        fprintf(stderr, "Error. Please try again correctly.\n");
        fprintf(stderr, "Usage: %s <CID> [-p <property1,property2,...>]\n",
                (argc > 0) ? argv[0] : "property_search");
        return EXIT_FAILURE;
    }

    // Combine all the bits to produce a functioning url.
    // snprintf never writes past the buffer; a return value >= the buffer size
    // means the url was truncated, which would query the wrong resource.
    int length = snprintf(url, sizeof url, "%s%s/property/%s/JSON", api, argv[1], properties);
    if (length < 0 || (size_t)length >= sizeof url) {
        fprintf(stderr, "Error. The requested URL is too long.\n");
        return EXIT_FAILURE;
    }

    // Initialize the CURL HTTP connection
    CURL *curl = curl_easy_init();

    // Check if CURL initialization is a success or not
    if (!curl) {
        fprintf(stderr, "init failed\n");
        return EXIT_FAILURE;
    }

    // Set the url to which the HTTP request will be sent to
    // first parameter is for the initialized curl HTTP request, second for the option to be set, and third for the value to be set
    curl_easy_setopt(curl, CURLOPT_URL, url);

    // Perform the HTTP request
    CURLcode result = curl_easy_perform(curl);
    int status = EXIT_SUCCESS;

    // If result is not retrieved then output error and report failure to the caller
    if (result != CURLE_OK) {
        fprintf(stderr, "download problem: %s\n", curl_easy_strerror(result));
        status = EXIT_FAILURE;
    }

    // Deallocate memory for the CURL connection
    curl_easy_cleanup(curl);
    return status;
}

The program is compiled, and an executable is created, by running the following command:

!make

We can search for a compound by its CID, for example: 1-Butyl-3-methyl-imidazolium; CID = 2734162

If we run the executable and enter the CID and the custom properties that we want to add, we get the result:

!./property_search 2734162 -p "inchi"
{
  "PropertyTable": {
    "Properties": [
      {
        "CID": 2734162,
        "InChI": "InChI=1S/C8H15N2/c1-3-4-5-10-7-6-9(2)8-10/h6-8H,3-5H2,1-2H3/q+1"
      }
    ]
  }
}

We can add additional properties as follows:

!./property_search 2734162 -p "inchi,XLogP,HBondDonorCount,HBondAcceptorCount,RotatableBondCount"
{
  "PropertyTable": {
    "Properties": [
      {
        "CID": 2734162,
        "InChI": "InChI=1S/C8H15N2/c1-3-4-5-10-7-6-9(2)8-10/h6-8H,3-5H2,1-2H3/q+1",
        "XLogP": 1.3,
        "HBondDonorCount": 0,
        "HBondAcceptorCount": 0,
        "RotatableBondCount": 3
      }
    ]
  }
}

The following command is used to output the default fields (inchi,IsomericSMILES,MolecularFormula,MolecularWeight):

!./property_search 2734162
{
  "PropertyTable": {
    "Properties": [
      {
        "CID": 2734162,
        "MolecularFormula": "C8H15N2+",
        "MolecularWeight": "139.22",
        "IsomericSMILES": "CCCCN1C=C[N+](=C1)C",
        "InChI": "InChI=1S/C8H15N2/c1-3-4-5-10-7-6-9(2)8-10/h6-8H,3-5H2,1-2H3/q+1"
      }
    ]
  }
}

The following command is used to output the SMILES with jq:

# Get SMILES with jq
!./property_search 2734162 | jq '.["PropertyTable"]["Properties"][0]["IsomericSMILES"]'
"CCCCN1C=C[N+](=C1)C"

2. PubChem Compound Image#

Download image of the requested CID#

We return to the Pub_Chem folder and create a new directory for this project:

%cd ..
!mkdir Image
%cd Image
%%file makefile

# Set the variable CC to gcc, which is used to build the program
CC=gcc

# Enable debugging information and enable all compiler warnings
CFLAGS=-g -Wall

# Set the bin variable as the name of the binary file we are creating
BIN=image_download

# "all" and "clean" name actions, not files; declaring them phony makes sure
# they still run even if a file called "all" or "clean" exists in the directory
.PHONY: all clean

# Create the binary file with the name we put
all: $(BIN)

# Map any file ending in .c to a binary executable.
# "$<" represents the .c file and "$@" represents the target binary executable.
# The recipe compiles the .c file using the gcc compiler with the CFLAGS and
# links the resulting binary with the CURL library.
# (Comments are kept outside of the recipe: a tab-indented "#" line is handed
# to the shell and echoed on every build.)
%: %.c
	$(CC) $(CFLAGS) $< -o $@ -lcurl

# Clean target which removes specific files: the binary file and any ".dSYM"
# (debug symbols for debugging) directories.
# The RM command uses -r to remove directories and -f to force delete.
clean:
	$(RM) -rf $(BIN) *.dSYM
%%file image_download.c

#include <curl/curl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

/* This code was adapted from https://stackoverflow.com/questions/10112959/download-an-image-from-server-curl-however-taking-suggestions-c
and modified to download the pubchem images */

// Download custom CID image in a .png format

/*
 * libcurl write callback: appends the received chunk to the file stream
 * passed in as userdata.
 *
 *   ptr      - the received data (size * nmemb bytes)
 *   size     - size of one element
 *   nmemb    - number of elements
 *   userdata - the FILE* to write into
 *
 * Returns the number of BYTES written. The caller compares this against
 * size * nmemb, so a smaller value (including the 0 returned when no stream
 * was supplied) signals a write error.
 */
size_t callbackfunction(void *ptr, size_t size, size_t nmemb, void* userdata) {
    // The destination file stream handed over by the caller
    FILE* stream = (FILE*)userdata;

    // Check if a stream is detected to write into the file
    if (!stream) {
        fprintf(stderr, "!!! No stream\n");
        return 0;
    }

    // fwrite reports the number of complete elements written; convert that
    // count to bytes, which is the unit the return value is measured in
    size_t written = fwrite(ptr, size, nmemb, stream);
    return written * size;
}

// Retrieve the image result and checks whether it is found or not
bool download_png(char* url, char name[]) {
    // Combine the name and the .txt and creates the following file
    strcat(name, ".png");
    FILE* fp = fopen(name, "wb");

    // If file is not created abort the system
    if (!fp) {
        printf("!!! Failed to create file on the disk\n");                                     
        return false;
    }

    // Initialize the CURL connection
    CURL* curlCtx = curl_easy_init();                                                           
    
    // If initialization does not work then error
    if (!curlCtx) {                                                                              
        fprintf(stderr, "init failed\n");
        return EXIT_FAILURE;
    }

    // Set the url to which the HTTP request will be sent to
    // first parameter is for the initialized curl HTTP request, second for the option to be set, and third for the value to be set
    curl_easy_setopt(curlCtx, CURLOPT_URL, url);

    // Set the data pointer for writing the response body of the HTTP request
    // The third parameter is a pointer to the file where the response data will be written.
    curl_easy_setopt(curlCtx, CURLOPT_WRITEDATA, fp);

    // Set the callback function which is called by libcurl for the response body of the HTTP request
    curl_easy_setopt(curlCtx, CURLOPT_WRITEFUNCTION, callbackfunction);
    
    // Set the option to enable HTTP redirects
    // For the third parameter the value of 1L enables following of HTTP redirects, and a value of 0L disables it.
    curl_easy_setopt(curlCtx, CURLOPT_FOLLOWLOCATION, 1);

    // Perform an HTTP rquest
    CURLcode rc = curl_easy_perform(curlCtx);                                                       
    
    // If request is unsuccessful then abort   
    if (rc) {
        printf("!!! Failed to download: %s\n", url);
        return false;
    }

    long res_code = 0;

    // Set the resposnse code retrieved from the HTTP website                  
    curl_easy_getinfo(curlCtx, CURLINFO_RESPONSE_CODE, &res_code);
    
    // Deallocate memory for the CURL connection
    curl_easy_cleanup(curlCtx);                                                                     

    // Avoid memory leaks by closing file pointer   
    fclose(fp);

    return true;
}

/*
 * Download the PubChem image of a compound.
 *
 * Usage: image_download <CID>
 *
 * Builds the PUG REST image URL for the CID given as the only argument and
 * passes it, together with the CID as the output base name, to download_png.
 * Returns EXIT_SUCCESS on success and EXIT_FAILURE on invalid arguments, an
 * over-long URL or a failed download.
 */
int main(int argc, char* argv[]) {
    // Exactly one argument (the CID) is required
    if (argc != 2) {
        fprintf(stderr, "Error. Please try again correctly\n");
        fprintf(stderr, "Usage: %s <CID>\n", (argc > 0) ? argv[0] : "image_download");
        return EXIT_FAILURE;
    }

    // Bits of data required for the API search
    const char api[] = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/";
    const char type[] = "/PNG";
    char url[1000];

    // Combine all the bits together to create the final URL.
    // snprintf never writes past the buffer; a return value >= the buffer size
    // means the URL was truncated and must not be used.
    int length = snprintf(url, sizeof url, "%s%s%s", api, argv[1], type);
    if (length < 0 || (size_t)length >= sizeof url) {
        fprintf(stderr, "Error. The requested URL is too long\n");
        return EXIT_FAILURE;
    }

    // If image not found retrieve error
    if (!download_png(url, argv[1])) {
        fprintf(stderr, "!! Failed to download file \n");
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}
!make

We can change the CID to our own preference to download images

!./image_download 2734162