package search;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.HashMap;
import java.util.TreeSet;

import chemaxon.formats.MolFormatException;
import chemaxon.sss.search.MolSearch;
import chemaxon.sss.search.SearchException;
import chemaxon.sss.search.StandardizedMolSearch;
import chemaxon.struc.Molecule;
import chemaxon.sss.SearchConstants;
import chemaxon.sss.screen.HashCode;
import chemaxon.util.IntVector;

/**
 * Running various types of duplicate search:
 *  - comparing every pair of molecules
 *  - comparing smiles format of molecules
 *  - comparing based on hash-code comparison
 * 
 * @author Robert Wagner
 * @version 5.0.3, 04/26/2008
 * 
 */
public class DuplicateSearch {

    /**
     * 1-based positions of the molecules that the most recent unique-SMILES
     * search reported as duplicates. Stays {@code null} until
     * {@link #searchForDuplicatesUniqueSmiles(Molecule[])} has been run;
     * {@link #searchForDuplicatesHash(Molecule[])} reads it to report matches
     * that the SMILES comparison did not find.
     */
    static IntVector duplicates = null;

    /**
     * Imports molecules from Util.standardInputFile()
     * (per the original author's note: 1000 structures from the NCI db),
     * aromatizes them and carries out three types of duplicate search on them.
     * Failures are reported on standard output together with a stack trace.
     *
     * @param args not used
     */
    public static void main(String[] args) {
        try {
            System.out.println("Reading molecules.");
            Molecule[] mols = MoleculeImport.moleculeArrayImport(
                    Util.standardInputFile());
            for (int i = 0; i < mols.length; i++) {
                mols[i].aromatize(); // needed for search!
            }

            // The SMILES search runs before the hash search so that the hash
            // search can cross-check its matches against the SMILES results.
            searchForDuplicates(mols);
            searchForDuplicatesUniqueSmiles(mols);
            searchForDuplicatesHash(mols);

        } catch (SearchException e) {
            System.out.println("Error during duplicate searching.");
            e.printStackTrace();
        } catch (MolFormatException e) {
            System.out.println("Bad structures in input file.");
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            System.out.println("Input file couldn't be found");
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("I/O error during molecule import.");
            e.printStackTrace();
        }
    }

    /**
     * Uses duplicate search to compare every pair of molecules.
     * Each molecule is compared against the molecules preceding it in the
     * array; only the first match per molecule is printed and counted.
     * Positions are printed 1-based, followed by the total and the elapsed
     * time in milliseconds.
     *
     * @param mols Molecule array to search for duplicates
     * @throws SearchException Error during duplicate searching.
     */
    public static void searchForDuplicates(Molecule[] mols)
            throws SearchException {

        StandardizedMolSearch stms = new StandardizedMolSearch();
        stms.setSearchType(SearchConstants.DUPLICATE);
        long start = System.currentTimeMillis();
        System.out.println("\nSearching for duplicates.\n\tMatching IDs");
        int num = 0;
        // The first molecule has no predecessors, so start with the second.
        for (int q = 1; q < mols.length; q++) {
            // The query is the same for every target of this pass,
            // so it is set once instead of once per comparison.
            stms.setQuery(mols[q]);
            for (int t = 0; t < q; t++) {
                stms.setTarget(mols[t]);
                if (stms.isMatching()) {
                    System.out.println("\t" + (q + 1) + " matches on " + (t + 1));
                    num++;
                    break; // one earlier match is enough to call it a duplicate
                }
            }
        }
        System.out.println("Found " + num + " duplicates in " +
                (System.currentTimeMillis() - start) +
                " milliseconds");
    }

    /**
     * Searches for duplicates based on comparison of the molecules' unique
     * SMILES representation.
     * A molecule is reported when its unique SMILES string equals that of an
     * earlier molecule in the array. The 1-based positions of the reported
     * molecules replace the contents of {@link #duplicates}.
     *
     * @param mols molecules to search
     */
    private static void searchForDuplicatesUniqueSmiles(Molecule[] mols) {
        long start = System.currentTimeMillis();

        System.out.println("\nSearching for duplicates" +
                " based on smiles string comparison.\n\tMatching IDs");
        int num = 0;
        TreeSet<String> seenSmiles = new TreeSet<String>(); // for faster searching

        duplicates = new IntVector();

        for (int i = 0; i < mols.length; i++) {
            String smiles = mols[i].toFormat("smiles:u"); // create unique smiles
            // add() returns false when the string is already in the set,
            // which is exactly the "seen before" case.
            if (!seenSmiles.add(smiles)) {
                num++;
                System.out.println("\t" + (i + 1) + " is duplicate.");
                duplicates.add(i + 1);
            }
        }
        System.out.println("Found " + num + " duplicates in " +
                (System.currentTimeMillis() - start) +
                " milliseconds");
    }

    /**
     * Searches for duplicates based on the comparison
     * of the molecules' hash code.
     * The equivalence of the hash codes doesn't imply
     * a structural equivalence, so molecules with equal hash codes
     * are additionally matched with a structure search.
     * Only the first confirmed match per molecule is printed and counted.
     * If the unique-SMILES search has been run before, every confirmed match
     * that is missing from its results is reported as well; otherwise that
     * cross-check is skipped.
     *
     * NOTE(review): no search type is set on the MolSearch used here, whereas
     * the exhaustive search sets SearchConstants.DUPLICATE explicitly --
     * confirm that the MolSearch default is what is intended.
     *
     * @param mols the molecules to search.
     * @throws SearchException Error during duplicate searching.
     */
    public static void searchForDuplicatesHash(Molecule[] mols)
            throws SearchException {

        MolSearch ms = new MolSearch();
        long start = System.currentTimeMillis();
        HashCode hc = new HashCode();

        int[] codes = new int[mols.length];
        for (int i = 0; i < mols.length; i++) {
            codes[i] = hc.getHashCode(mols[i]); // generate hash code
        }

        System.out.println("\nSearching for duplicates" +
                " based on hash-code comparison " +
                "\n and subsequent searching\n\tMatching IDs");
        int num = 0;
        for (int q = 0; q < mols.length; q++) {
            for (int t = 0; t < q; t++) {
                if (codes[q] != codes[t]) {
                    continue; // different hash codes: no structure search needed
                }
                // hash codes are equal: check with structure searching
                ms.setQuery(mols[q]);
                ms.setTarget(mols[t]);
                if (ms.isMatching()) {
                    System.out.println("\t" + (q + 1) + " matches on " + (t + 1));
                    num++;
                    // duplicates is null when the SMILES search has not been
                    // run; there is nothing to cross-check against then.
                    if (duplicates != null && !duplicates.contains(q + 1)) {
                        System.out.println(
                                "\tNot found with unique SMILES:"
                                + (q + 1) + "  " + (t + 1));
                    }
                    break;
                }
            }
        }
        System.out.println("Found " + num + " duplicates in " +
                (System.currentTimeMillis() - start) +
                " milliseconds");
    }
}

Do you have a question? Would you like to learn more? Please browse the related topics on our support forum or search the website. If you want to suggest modifications or improvements to our documentation, email our support directly!