aramorph-users
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Aramorph-users] Arabic indexing and searching problem


From: Manjeet Chaudhary
Subject: [Aramorph-users] Arabic indexing and searching problem
Date: Fri, 22 Apr 2005 10:20:11 +0530
User-agent: Mozilla Thunderbird 1.0 (Windows/20041206)

Hello All

I have used the Arabic analyzer designed by Mr Pierrick Brihaye, but I am facing
some problems at search time.

1.   I am using Arabic data in UTF-8 format for indexing.
2.   I think indexing is working properly.
3.   After indexing, I am searching the data using Unicode.
4.   But the searcher is unable to find data.

So could you please help me solve this problem?
I have attached the copy of my code with this mail.

Kindly take a look at it and please suggest a solution.

Thank you
Manjeet Chaudhary
Infogrid Pacific Pte Ltd
Pune, India.


###Java code for Indexing..
--------------------------------
/*
//---------------------------------------------------------------
Name      :Indexer.java
Version   :1.0
Purpose   :For Creating Index
Author    :Vinayak Rajeshirke
Created on:6 April 05
//----------------------------------------------------------------
*/
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
//import java.io.BufferedReader;
import gpl.pierrick.brihaye.aramorph.lucene.ArabicStemAnalyzer;

import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.File;
import java.io.IOException;
//import java.io.FileReader;

public class Indexer{
public static void index(File indexDir, File dataDir) throws IOException
{
    //Checking for the dierectory
    if (!dataDir.exists() || !dataDir.isDirectory())
        {
        throw new IOException(dataDir + " does not exist or is not a
directory");
        }
    IndexWriter writer = new IndexWriter(indexDir, new
ArabicStemAnalyzer(), true);//Used for writing Index
    indexDirectory(writer, dataDir);
    writer.close();
}//end of index()

private static void indexDirectory(IndexWriter writer, File dir)
throws IOException
{
//Colleting all files in that directory
    File[] files = dir.listFiles();

    for (int i=0; i < files.length; i++)
    {
        File f = files[i];
        if (f.isDirectory())
        {
            //if directory present in that directory then recursion
            indexDirectory(writer, f);
        } else if (f.getName().endsWith(".txt"))
        {
            indexFile(writer, f);
        }
    }
}//end of indexDirectory

private static void indexFile(IndexWriter writer, File f) throws
IOException
{
    System.out.println("Indexing " + f.getName());
  String inputEncoding = "UTF-8";//"Cp1256";
    Document doc = new Document();
  doc.add(Field.Text("contents",new InputStreamReader(new
FileInputStream(f),inputEncoding)));
  doc.add(Field.Keyword("filename", f.getCanonicalPath()));
  //adding index
    writer.addDocument(doc);
}//end of indexFile

public static void main(String[] args) throws Exception
{
try{
    /*if (args.length != 2)
    {
        throw new Exception("Usage: " + Indexer.class.getName() + "
<index dir> <data dir>");
    }*/
    File indexDir = new File("index");
    File dataDir = new File("Arabic_Text");
    index(indexDir, dataDir);
    }//end of try
    catch(Exception e){
    System.out.println("Error occurs.Please Check the code");
    }//end of finally
}//end of main
}//end of class Indexer
--------------------------------

### Java code for Searching
----------------------------------
import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import gpl.pierrick.brihaye.aramorph.lucene.ArabicStemAnalyzer;
import java.io.File;

public class Searcher
{
  public static void main(String[] args) throws Exception
      {
     /* if (args.length != 2)
          {
          throw new Exception("Usage: " + Searcher.class.getName() + "
<index dir> <query>");
           }*/

      File indexDir = new File("index");
      String q ="\u0661\u0630\u0642\u0644";
      //"\u0661\u0645\u0648\u0647";
      //"\u0647\u0648\u0645\u0644";
      //"\u0645\u0646\u0642\u0647\u062F\u0661";
      //"\u0661\u062F\u0647\u0642\u0646\u0645";
      //"\u0661\u0630\u0642\u0644";
      //"\u0644\u0642\u0630\u0661"

      if (!indexDir.exists() || !indexDir.isDirectory())
          {
          throw new Exception(indexDir + " is does not exist or is not
a directory.");
          }

      search(indexDir, q);
     }//end of main




public static void search(File indexDir, String q)  throws Exception
{
  Directory fsDir = FSDirectory.getDirectory(indexDir, false);
  IndexSearcher is = new IndexSearcher(fsDir);

  Query query = QueryParser.parse(q, "contents", new
ArabicStemAnalyzer());
  Hits hits = is.search(query);
  System.out.println("Found " + hits.length() + " document(s) that
matched query '" + q + "':");
  for (int i = 0; i < hits.length(); i++) {
      Document doc = hits.doc(i);
      System.out.println(doc.get("filename"));
  }
}//end of search
}//end of Searcher
----------------------------------




reply via email to

[Prev in Thread] Current Thread [Next in Thread]