[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Aramorph-users] Arabic indexing and searching problem
From: |
Manjeet Chaudhary |
Subject: |
[Aramorph-users] Arabic indexing and searching problem |
Date: |
Fri, 22 Apr 2005 10:20:11 +0530 |
User-agent: |
Mozilla Thunderbird 1.0 (Windows/20041206) |
Hello All
I have used the Arabic analyzer designed by Mr Pierrick Brihaye, but I
am facing some problems at search time.
1. I am using Arabic data in UTF-8 format for indexing.
2. I think indexing is working properly.
3. After indexing, I am searching the data using Unicode.
4. But the searcher is unable to find data.
So could you please help me solve this problem?
I have attached the copy of my code with this mail.
Kindly take a look at it and please suggest a solution.
Thank you
Manjeet Chaudhary
Infogrid Pacific Pte Ltd
Pune, India.
###Java code for Indexing..
--------------------------------
/*
//---------------------------------------------------------------
Name :Indexer.java
Version :1.0
Purpose :For Creating Index
Author :Vinayak Rajeshirke
Created on:6 April 05
//----------------------------------------------------------------
*/
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
//import java.io.BufferedReader;
import gpl.pierrick.brihaye.aramorph.lucene.ArabicStemAnalyzer;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.File;
import java.io.IOException;
//import java.io.FileReader;
public class Indexer{
public static void index(File indexDir, File dataDir) throws IOException
{
//Checking for the dierectory
if (!dataDir.exists() || !dataDir.isDirectory())
{
throw new IOException(dataDir + " does not exist or is not a
directory");
}
IndexWriter writer = new IndexWriter(indexDir, new
ArabicStemAnalyzer(), true);//Used for writing Index
indexDirectory(writer, dataDir);
writer.close();
}//end of index()
private static void indexDirectory(IndexWriter writer, File dir)
throws IOException
{
//Colleting all files in that directory
File[] files = dir.listFiles();
for (int i=0; i < files.length; i++)
{
File f = files[i];
if (f.isDirectory())
{
//if directory present in that directory then recursion
indexDirectory(writer, f);
} else if (f.getName().endsWith(".txt"))
{
indexFile(writer, f);
}
}
}//end of indexDirectory
private static void indexFile(IndexWriter writer, File f) throws
IOException
{
System.out.println("Indexing " + f.getName());
String inputEncoding = "UTF-8";//"Cp1256";
Document doc = new Document();
doc.add(Field.Text("contents",new InputStreamReader(new
FileInputStream(f),inputEncoding)));
doc.add(Field.Keyword("filename", f.getCanonicalPath()));
//adding index
writer.addDocument(doc);
}//end of indexFile
public static void main(String[] args) throws Exception
{
try{
/*if (args.length != 2)
{
throw new Exception("Usage: " + Indexer.class.getName() + "
<index dir> <data dir>");
}*/
File indexDir = new File("index");
File dataDir = new File("Arabic_Text");
index(indexDir, dataDir);
}//end of try
catch(Exception e){
System.out.println("Error occurs.Please Check the code");
}//end of finally
}//end of main
}//end of class Indexer
--------------------------------
### Java code for Searching
----------------------------------
import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import gpl.pierrick.brihaye.aramorph.lucene.ArabicStemAnalyzer;
import java.io.File;
public class Searcher
{
public static void main(String[] args) throws Exception
{
/* if (args.length != 2)
{
throw new Exception("Usage: " + Searcher.class.getName() + "
<index dir> <query>");
}*/
File indexDir = new File("index");
String q ="\u0661\u0630\u0642\u0644";
//"\u0661\u0645\u0648\u0647";
//"\u0647\u0648\u0645\u0644";
//"\u0645\u0646\u0642\u0647\u062F\u0661";
//"\u0661\u062F\u0647\u0642\u0646\u0645";
//"\u0661\u0630\u0642\u0644";
//"\u0644\u0642\u0630\u0661"
if (!indexDir.exists() || !indexDir.isDirectory())
{
throw new Exception(indexDir + " is does not exist or is not
a directory.");
}
search(indexDir, q);
}//end of main
public static void search(File indexDir, String q) throws Exception
{
Directory fsDir = FSDirectory.getDirectory(indexDir, false);
IndexSearcher is = new IndexSearcher(fsDir);
Query query = QueryParser.parse(q, "contents", new
ArabicStemAnalyzer());
Hits hits = is.search(query);
System.out.println("Found " + hits.length() + " document(s) that
matched query '" + q + "':");
for (int i = 0; i < hits.length(); i++) {
Document doc = hits.doc(i);
System.out.println(doc.get("filename"));
}
}//end of search
}//end of Searcher
----------------------------------
- [Aramorph-users] Arabic indexing and searching problem,
Manjeet Chaudhary <=