Program code for building TDM

*************************************************************************
Dear Reader,

All the posts on shakthydoss.wordpress.com have been moved to shakthydoss.com

shakthydoss.wordpress.com is no longer maintained. To get the latest updates and to follow up on your comments, please visit shakthydoss.com and subscribe.

Thank you
shakthydoss

**************************************************************************

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

/**
*
* @author shakthydoss
*/
public class ReadingMultipleFile {
  public static  List keywordList = new  ArrayList();
  public static  int [][] countMatrix;
  static String path = "E:\\Colloge  studies\\SEM – 7\\Text Mining\\Assignments\\Assignment -7\\Corpus2";
//  static String path = "Corpus2";
  public static File folder = new File(path);
  public static File[] listOfFiles = folder.listFiles();

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws FileNotFoundException, IOException {

        String s ,temp;
        StringTokenizer st ;
        int countKeyword =0;

        Mystemmer stem = new Mystemmer();
       //  List keywordList = new  ArrayList();

        StopWordList swl = new StopWordList();

        BufferedWriter bw = new BufferedWriter(new FileWriter("E:\\Colloge  studies\\SEM – 7\\Text Mining\\Assignments\\Assignment -7\\keywords.txt"));

          if (listOfFiles.length > 0)
          {
               for (int i = 0; i < listOfFiles.length; i++)
               {
                 if (listOfFiles[i].isFile())
                 {
                    String p1 = listOfFiles[i].getName();
                  //  System.out.println("["+i+"] " + p1);
                   BufferedReader br = new BufferedReader(new FileReader(listOfFiles[i].getPath()));

                    while((s=br.readLine())!=null)
                    {
                       st = new StringTokenizer(s, " ", false);
                       while(st.hasMoreTokens())
                       {
                           temp = st.nextToken();
                           if(swl.stopWord.contains(temp))
                           {
                               if(st.hasMoreTokens())
                               st.nextToken();
                               //System.out.println(temp);
                           }
                           else
                           {
                               if(temp.length() <=3||temp.length()>=15)
                               {
                                  if(st.hasMoreTokens())
                                  st.nextToken();
                               }
                               else
                               {
                                   temp = stem.DoSuffixStremmer(temp);
                                   // put the stemmer here
                                   if(keywordList.contains(temp)==false) //checking in keyword_array
                                   {
                                    keywordList.add(temp); // adding keyword to keyword_array
                                    bw.write(temp);
                                    countKeyword++;
                                    bw.newLine();
                                   }
                               }
                           }
                       } // while ends
                    } // while ends
                 }
              }
               bw.close();
         }
         System.out.println("");
         System.out.println("No of Documents – "+listOfFiles.length);
         System.out.println("No of keywords – "+countKeyword);
         System.out.println("");
         System.out.println("");

         countMatrix = new int[listOfFiles.length][keywordList.size()];

        for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size(); j++) {
                countMatrix[i][j] =0;
            }
        }

        

           if (listOfFiles.length > 0)
          {
               for (int i = 0; i < listOfFiles.length; i++)
               {
                 if (listOfFiles[i].isFile())
                 {
                    String p1 = listOfFiles[i].getName();
                  //System.out.println("["+i+"] " + p1);
                   BufferedReader br = new BufferedReader(new FileReader(listOfFiles[i].getPath()));

                    while((s=br.readLine())!=null)
                    {
                       st = new StringTokenizer(s, " ", false);
                       while(st.hasMoreTokens())
                       {
                           temp = st.nextToken();
                           if(swl.stopWord.contains(temp))
                           {
                               if(st.hasMoreTokens())
                               st.nextToken();
                               //System.out.println(temp);
                           }
                           else
                           {
                               if(temp.length() <=3||temp.length()>=15)
                               {
                                  if(st.hasMoreTokens())
                                  st.nextToken();
                               }
                               else
                               {
                                   // put stemmer here
                                   temp = stem.DoSuffixStremmer(temp);
                                    if(keywordList.contains(temp)==true) // checking the keyword in keyword_array
                                   {
                                     //generating count matrix
                                    countMatrix[i][keywordList.indexOf(temp)] = countMatrix[i][keywordList.indexOf(temp)] + 1;
                                   }
                               }
                           }

                       } // while ends
                    } // while ends
                 }
              }
               bw.close();
            // System.out.println("no of keywords – "+ii);
         }

         System.out.println("************************** Count Matrix *************************");
         System.out.println("");

          for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size(); j++) {
                System.out.print(","+countMatrix[i][j]);
            }
             System.out.println(" ");
        }
          TDIDF_Matrix tM = new TDIDF_Matrix();
          tM.compute_tottal_no_words_in_doc();
          tM.compute_num_of_doc_in_which_word_i_appears();
          tM.compute_TDIDF(listOfFiles.length, keywordList.size());

       }// main closing
} // class closing

 

 

//  TDIDF_Matrix –> to build weighted matrix from count matrix

import java.text.DecimalFormat;

public class TDIDF_Matrix  extends ReadingMultipleFile{
public static double[][] tdidf;
int[] tottal_no_words_in_doc;
int[] num_of_doc_in_which_word_i_appears;
  DecimalFormat twoDForm = new DecimalFormat("0.00000");
    public TDIDF_Matrix() {
         tdidf = new double[listOfFiles.length][keywordList.size()];
         tottal_no_words_in_doc = new int[listOfFiles.length];
         num_of_doc_in_which_word_i_appears = new int[keywordList.size()];
    }

    public void compute_tottal_no_words_in_doc()
    {
        int sum = 0;
        for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size(); j++)
            {
                if((countMatrix[i][j])>0)
                 {
                     sum = sum+1;
                 }
            }
            tottal_no_words_in_doc[i]=sum;
            sum =0;
        }

        for (int i = 0; i < listOfFiles.length; i++) {
            System.out.println("Total no of words in document : "+i+" –> "+tottal_no_words_in_doc[i]);
        }
    }

    public void compute_num_of_doc_in_which_word_i_appears()
    {
        int sum = 0;
        for (int i = 0; i < keywordList.size(); i++) {
            for (int j = 0; j < listOfFiles.length; j++)
            {
                if((countMatrix[j][i])>0)
                 {
                     sum = sum+1;
                 }
            }
            num_of_doc_in_which_word_i_appears[i] = sum;
            sum = 0;
        }

        for (int i = 0; i < keywordList.size(); i++) {
            System.out.println("word : "+i +" occured in "+num_of_doc_in_which_word_i_appears[i]+" documents ");
        }
    }

    public void compute_TDIDF(int x , int y)
    {

        //initializing the tdidf
        for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size() ; j++) {
                tdidf[i][j]=0.00000;
            }

        }

       // ReadingMultipleFile re = new ReadingMultipleFile();
        for (int i = 0; i < listOfFiles.length; i++)
        {
            for (int j = 0; j < keywordList.size(); j++)
            {
               tdidf[i][j] = Double.valueOf( twoDForm.format((Double.valueOf(twoDForm.format((countMatrix[i][j]*10000)/1+tottal_no_words_in_doc[i])).doubleValue()/10000) * (Math.log( 50 / num_of_doc_in_which_word_i_appears[j])))).doubleValue();
            }//for closing
        } // for closing

        System.out.println("");
        System.out.println(" ************** TFIDF Matrix **************");
        System.out.println("");
        for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size(); j++) {
                System.out.print(tdidf[i][j]+"  ,  ");
            }
            System.out.println("");
        }

        //computeSVD();
    }

    public void computeSVD()
    {
        System.out.println("");
        System.out.println(" ************** TFIDF Matrix **************");
        System.out.println("");
        for (int i = 0; i < listOfFiles.length; i++) {
            for (int j = 0; j < keywordList.size(); j++) {
                System.out.print(tdidf[i][j]+"  ,  ");
            }
            System.out.println("");
        }
    }

}

Click here to see my simple stemmer implementation

Advertisements
Posted in Uncategorized. 5 Comments »

5 Responses to “Program code for building TDM”

  1. sita2901 Says:

    ……
    tdidf[i][j] = Double.valueOf( twoDForm.format((Double.valueOf(twoDForm.format((countMatrix[i][j]*10000)/1+tottal_no_words_in_doc[i])).doubleValue()/10000) * (Math.log( 50 / num_of_doc_in_which_word_i_appears[j])))).doubleValue();
    …..
    sir, where 1 (number in “1+tottal_no_words_in_doc[i]”) comes from?? why it added by 1??, thanks before..

    • shakthydoss Says:

      Hi sita ,

      It is just for my own convenience, so that I get the values I want.
      You can omit the 1 and continue with the original formula that I mentioned in the post.

      • sita2901 Says:

        hhmmm.. i see,..
        oke sir, thanks a lot

  2. alinux8per Says:

    hello shakthydoss;
    i have this error :

    Exception in thread “main” java.lang.NumberFormatException: For input string: “20000,00000”
    at sun.misc.FloatingDecimal.readJavaFormatString(FloatingDecimal.java:1242)
    at java.lang.Double.valueOf(Double.java:492)
    at lsimodelimplementation.TDIDF_Matrix.compute_TDIDF(TDIDF_Matrix.java:74)
    at lsimodelimplementation.ReadingMultipleFile.main(ReadingMultipleFile.java:182)
    Java Result: 1

    74 ligne is : tdidf[i][j] = Double.valueOf( twoDForm.format((Double.valueOf(twoDForm.format((countMatrix[i][j]*10000)/1+tottal_no_words_in_doc[i])).doubleValue()/10000) * (Math.log( listOfFiles.length / num_of_doc_in_which_word_i_appears[j])))).doubleValue();

    and 182 ligne is : tM.compute_TDIDF(listOfFiles.length, keywordList.size());

    can you help me please
    thanks before.

  3. shakthydoss Says:

    I guess the input string should be 200000000 instead of 20000,00000


Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: