PubChem API in Matlab#

by Vincent F. Scalfani and Anastasia Ramig

These recipe examples were tested on November 23, 2022 using Matlab R2022b.

PubChem API Documentation: https://pubchemdocs.ncbi.nlm.nih.gov/programmatic-access

Attribution: This tutorial was adapted from supporting information in:

Scalfani, V. F.; Ralph, S. C.; Alshaikh, A. A.; Bara, J. E. Programmatic Compilation of Chemical Data and Literature From PubChem Using Matlab. Chemical Engineering Education, 2020, 54, 230. https://doi.org/10.18260/2-1-370.660-115508 (and vfscalfani/MATLAB-cheminformatics)

Setup#

Define the PubChem PUG-REST API base URL:

%% base URL of the PubChem PUG-REST API (compound domain);
%% every request below is built by appending to this string
api = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/';

%% start from the default MATLAB web options and raise the
%% request timeout to 30 seconds
options = weboptions;
options.Timeout = 30;

1. PubChem Similarity#

Get Compound Image#

We can search for a compound and display an image, for example: 1-Butyl-3-methyl-imidazolium; CID = 2734162.

%% Fetch the PNG depiction of 1-Butyl-3-methyl-imidazolium (CID = 2734162)
%% from PubChem and show it in a figure
CID_SS_query = '2734162';

%% request URL: <api>cid/<CID>/PNG
CID_url = sprintf('%scid/%s/PNG', api, CID_SS_query);

%% imread accepts a URL directly; map receives the colormap returned by imread
[CID_img, map] = imread(CID_url);
imshow(CID_img, map)

Output:

../../_images/matlab_pc_im0.png

Replace the above CID value (CID_SS_query) with a different CID to customize.

Retrieve InChI and SMILES#

%% Request the InChI of the query compound as plain text and print it
%% request URL: <api>cid/<CID>/property/inchi/TXT
inchi_url = sprintf('%scid/%s/property/inchi/TXT', api, CID_SS_query);
inchi = webread(inchi_url, options);
disp(inchi)

Output:

InChI=1S/C8H15N2/c1-3-4-5-10-7-6-9(2)8-10/h6-8H,3-5H2,1-2H3/q+1
%% Request the Isomeric SMILES of the query compound as plain text and print it
%% request URL: <api>cid/<CID>/property/IsomericSMILES/TXT
IS_url = sprintf('%scid/%s/property/IsomericSMILES/TXT', api, CID_SS_query);
IS = webread(IS_url, options);
disp(IS)

Output:

CCCCN1C=C[N+](=C1)C

Retrieve Identifier and Property Data#

Get the following data for the retrieved CIDs (SS_CIDs): InChI, Isomeric SMILES, MW, Heavy Atom Count, Rotatable Bond Count, and Charge.

%% Create an identifier/property dataset from Similarity Search results
%% Retrieve the following data from CID hit results:
%% InChI, Isomeric SMILES, MW, Heavy Atom Count, Rotatable Bond Count, and
%% Charge
%%
%% NOTE(review): SS_CIDs is not defined in this part of the tutorial; it is
%% assumed to be a cell array with one CID per row in column 1 - confirm.
%% The web options variable (options) comes from the Setup section.

%% define the API base URL once; it does not change between iterations
api = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/';

%% PubChem property names to retrieve, listed in storage order.
%% Property k is stored in column k+1 of SS_CIDs (column 1 holds the CID).
%% For example, MolecularWeight is the 3rd property, so it is placed in
%% column 4: the first MW value gets stored in {1,4}, the second in {2,4},
%% the third in {3,4}, etc.
%% To add more data, simply append another property name (e.g. 'TPSA');
%% it will be stored in the next column.
prop_names = {'InChI', 'IsomericSMILES', 'MolecularWeight', ...
    'HeavyAtomCount', 'RotatableBondCount', 'Charge'};

%% be polite to PubChem server: seconds to wait after every request
n = 0.5;

%% setup a for loop that processes each CID one-by-one.
%% size(...,1) counts rows; length() would return the largest dimension and
%% over-run the rows when this section is re-run on an array that already
%% has more columns than rows.
for r = 1:size(SS_CIDs,1)
    CID = SS_CIDs{r,1};

    for k = 1:numel(prop_names)
        % define api call for this CID/property pair
        prop_url = [api 'cid/' num2str(CID) '/property/' prop_names{k} '/TXT'];

        % retrieve identifier or property data; a failed request is
        % recorded as 'not found' so the remaining CIDs are still processed
        try
            prop_value = webread(prop_url, options);
        catch
            prop_value = 'not found';
            warning('PubChem:propertyNotFound', ...
                'CID %s: property %s not found.', num2str(CID), prop_names{k});
        end

        % wait after every request, successful or not
        pause(n)

        % add property data to SS_CIDs data array
        SS_CIDs{r,k+1} = prop_value;
    end

end

Compile Data into a Table#

We can display the data as a table:

%% convert cell array to string and remove leading and trailing white space
%% (the text returned by the web requests is trimmed here before display)
SS_CIDs_string = strtrim(string(SS_CIDs));
%% convert to table; the names follow the column order of SS_CIDs
SSq_table = array2table(SS_CIDs_string, 'VariableNames',{'CID', 'InChI','IsoSMI','MW',...
    'HeavyAtomCount','RotatableBondCount','Charge'});
%% rearrange table so that the SMILES column comes first
SSq_table2 = SSq_table(:, {'IsoSMI' 'CID' 'InChI' 'MW' 'HeavyAtomCount' 'RotatableBondCount' 'Charge'});
%% display the first 10 rows, or all rows when there are fewer than 10
%% (indexing 1:10 directly errors on shorter tables)
n_show = min(10, height(SSq_table2));
disp(SSq_table2(1:n_show,:))

Output:

              IsoSMI                      CID                                               InChI                                               MW       HeavyAtomCount    RotatableBondCount    Charge
___________________________________    __________    ____________________________________________________________________________________    ________    ______________    __________________    ______

"CCCCN1C=C[N+](=C1)C.[Cl-]"            "2734161"     "InChI=1S/C8H15N2.ClH/c1-3-4-5-10-7-6-9(2)8-10;/h6-8H,3-5H2,1-2H3;1H/q+1;/p-1"          "174.67"         "11"                "3"             "0"
"CCCCN1C=CN=C1"                        "61347"       "InChI=1S/C7H12N2/c1-2-3-5-9-6-4-8-7-9/h4,6-7H,2-3,5H2,1H3"                             "124.18"         "9"                 "3"             "0"
"CCCCCN1C=CN=C1"                       "529334"      "InChI=1S/C8H14N2/c1-2-3-4-6-10-7-5-9-8-10/h5,7-8H,2-4,6H2,1H3"                         "138.21"         "10"                "4"             "0"
"CCCCN1C=CN=C1C"                       "304622"      "InChI=1S/C8H14N2/c1-3-4-6-10-7-5-9-8(10)2/h5,7H,3-4,6H2,1-2H3"                         "138.21"         "10"                "3"             "0"
"CCCN1C=CN=C1"                         "118785"      "InChI=1S/C6H10N2/c1-2-4-8-5-3-7-6-8/h3,5-6H,2,4H2,1H3"                                 "110.16"         "8"                 "2"             "0"
"CCCN1C=C[N+](=C1)C.[I-]"              "12971008"    "InChI=1S/C7H13N2.HI/c1-3-4-9-6-5-8(2)7-9;/h5-7H,3-4H2,1-2H3;1H/q+1;/p-1"               "252.10"         "10"                "2"             "0"
"CCCCN1C=C[N+](=C1)C.[I-]"             "11448496"    "InChI=1S/C8H15N2.HI/c1-3-4-5-10-7-6-9(2)8-10;/h6-8H,3-5H2,1-2H3;1H/q+1;/p-1"           "266.12"         "11"                "3"             "0"
"CCCCN1C=C[N+](=C1)C.C(#N)[S-]"        "11424151"    "InChI=1S/C8H15N2.CHNS/c1-3-4-5-10-7-6-9(2)8-10;2-1-3/h6-8H,3-5H2,1-2H3;3H/q+1;/p-1"    "197.30"         "13"                "3"             "0"
"CCCCN1C=C[N+](=C1)C.C(=[N-])=NC#N"    "11171745"    "InChI=1S/C8H15N2.C2N3/c1-3-4-5-10-7-6-9(2)8-10;3-1-5-2-4/h6-8H,3-5H2,1-2H3;/q+1;-1"    "205.26"         "15"                "3"             "0"
"CCCN1C=C[N+](=C1)C.[Br-]"             "11160028"    "InChI=1S/C7H13N2.BrH/c1-3-4-9-6-5-8(2)7-9;/h5-7H,3-4H2,1-2H3;1H/q+1;/p-1"              "205.10"         "10"                "2"             "0