TextContentIndexer.java
| Index Score | ||
|---|---|---|
![]() |
![]() |
org.apache.slide.index |
![]() |
![]() |
Jakarta Slide |
View: Reasons, Metrics, Source Code
These are the metrics that contribute to the Enerjy Score for this file, ranked by impact. So the metrics listed at the top influence the score to a greater extent than the metrics listed at the bottom.
/*
* $Header$
* $Revision: 510360 $
* $Date: 2007-02-21 21:55:26 -0500 (Wed, 21 Feb 2007) $
*
* ====================================================================
*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.slide.index;
import org.apache.slide.search.IndexException;
import org.apache.slide.search.basic.IBasicExpressionFactory;
import org.apache.slide.util.logger.Logger;
import org.apache.slide.common.*;
import org.apache.slide.content.NodeRevisionDescriptors;
import org.apache.slide.content.NodeRevisionNumber;
import org.apache.slide.content.NodeRevisionDescriptor;
import org.apache.slide.content.NodeRevisionContent;
import org.apache.slide.store.IndexStore;
import org.apache.slide.extractor.ExtractorManager;
import org.apache.slide.extractor.ExtractorException;
import org.apache.slide.extractor.ContentExtractor;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import java.io.IOException;
import java.io.CharArrayReader;
import java.io.ByteArrayInputStream;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
/**
 * Lucene based IndexStore for indexing content.
 * Apart from indexing the content as text field it adds
 * indexes using the registered content extractors.
 *
 * <p>Every resource is stored as one Lucene document holding its URI in
 * {@link #URI_FIELD} and its text in one or more {@link #CONTENT_TEXT}
 * fields. The modifying operations ({@link #createIndex},
 * {@link #updateIndex}, {@link #dropIndex}) are synchronized on this
 * instance. Failures while writing the index are logged and not
 * propagated to the caller.
 */
public class TextContentIndexer extends XAServiceBase implements IndexStore {

    // Names of the parameters read in setParameters()
    private static final String INDEX_PATH = "indexpath";
    private static final String INCLUDES = "includes";
    private static final String ANALYZER = "analyzer";

    /** Name of the Lucene field holding the URI of the indexed resource. */
    public static final String URI_FIELD = "uri";

    /** Name of the Lucene field(s) holding the text of the indexed resource. */
    public static final String CONTENT_TEXT = "content";

    /** Directory of the Lucene index, taken from the "indexpath" parameter. */
    private String indexpath = "";

    /** URI prefixes to index; <code>null</code> means "index everything". */
    private Collection includes;

    /** Optional class name of the analyzer, taken from the "analyzer" parameter. */
    private String analyzerClassName;

    /** Analyzer used for all index writers; set by {@link #initAnalyzer()}. */
    private Analyzer analyzer;

    /** Set by {@link #connect()}, cleared by {@link #disconnect()}. */
    private boolean started = false;

    /**
     * Create Index, if not yet done.
     * First tries to open the index at the configured path; if that fails
     * a new index is created there.
     *
     * @param    token               a  NamespaceAccessToken
     *
     * @throws   org.apache.slide.common.ServiceInitializationFailedException
     *           if the analyzer cannot be set up or the index can neither
     *           be opened nor created
     */
    public void initialize(NamespaceAccessToken token)
        throws ServiceInitializationFailedException
    {
        initAnalyzer();

        IndexWriter indexWriter = null;
        try {
            // will fail, if the index does not yet exist
            indexWriter = new IndexWriter(indexpath, analyzer, false);
        } catch (IOException openFailure) {
            try {
                // create index
                indexWriter = new IndexWriter(indexpath, analyzer, true);
            } catch (IOException createFailure) {
                // Report the failure that is actually propagated.
                getLogger().log("Error while initializing the Lucene index " + createFailure.getMessage(), LOG_CHANNEL, Logger.ERROR);
                throw new ServiceInitializationFailedException(this, createFailure);
            }
        }

        try {
            indexWriter.close();
        } catch (IOException e) {
            getLogger().log("Error while initializing the Lucene index " + e.getMessage(), LOG_CHANNEL, Logger.ERROR);
            throw new ServiceInitializationFailedException(this, e);
        }

        getLogger().log("Lucene is correctly initialized", LOG_CHANNEL, Logger.INFO);
    }

    /**
     * Index an object content.
     * Does nothing if the URI is not covered by the configured includes.
     * I/O and extraction errors are logged, not thrown.
     *
     * @param      uri                 Uri of the resource
     * @param      revisionDescriptor  descriptor of the revision being indexed
     * @param      revisionContent     content of the revision being indexed
     * @exception  IndexException      declared by the interface; not thrown
     *                                 by this implementation
     */
    public synchronized void createIndex(Uri uri,
                                         NodeRevisionDescriptor revisionDescriptor,
                                         NodeRevisionContent revisionContent)
        throws IndexException
    {
        if (!isIncluded(uri.toString())) return;

        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(indexpath, analyzer, false);
            indexWriter.addDocument(buildIndexDocument(uri, revisionDescriptor, revisionContent));
            indexWriter.optimize();

            getLogger().log(
                "Added '" + describeRevision(uri, revisionDescriptor) + "' to index",
                LOG_CHANNEL,
                Logger.INFO);
        } catch (IOException e) {
            getLogger().log(
                "Error creating an index with " + describeRevision(uri, revisionDescriptor) + ": " + e.getMessage(),
                LOG_CHANNEL,
                Logger.ERROR);
        } catch (ExtractorException e) {
            getLogger().log(
                "Error extracting content from " + describeRevision(uri, revisionDescriptor) + ": " + e.getMessage(),
                LOG_CHANNEL,
                Logger.ERROR);
        } finally {
            closeIndexWriter(indexWriter);
        }
    }

    /**
     * Replaces the index entry of a resource: all documents stored for the
     * URI are deleted, then a fresh document is added.
     * Does nothing if the URI is not covered by the configured includes.
     * I/O and extraction errors are logged, not thrown.
     *
     * @param    uri                 an Uri
     * @param    revisionDescriptor  a  NodeRevisionDescriptor
     * @param    revisionContent     a  NodeRevisionContent
     *
     * @throws   IndexException      declared by the interface; not thrown
     *                               by this implementation
     */
    public synchronized void updateIndex(Uri uri,
                                         NodeRevisionDescriptor revisionDescriptor,
                                         NodeRevisionContent revisionContent)
        throws IndexException
    {
        if (!isIncluded(uri.toString())) return;

        IndexWriter indexWriter = null;
        try {
            // The reader is closed again before the writer is opened.
            deleteIndexEntries(uri);

            indexWriter = new IndexWriter(indexpath, analyzer, false);
            indexWriter.addDocument(buildIndexDocument(uri, revisionDescriptor, revisionContent));
            indexWriter.optimize();

            if (getLogger().isEnabled(Logger.DEBUG)) {
                getLogger().log(
                    "Updated '" + describeRevision(uri, revisionDescriptor) + "' to index",
                    LOG_CHANNEL,
                    Logger.DEBUG);
            }
        } catch (IOException e) {
            getLogger().log(
                "Error updating the index with " + describeRevision(uri, revisionDescriptor) + ": " + e.getMessage(),
                LOG_CHANNEL,
                Logger.ERROR);
        } catch (ExtractorException e) {
            getLogger().log(
                "Error extracting content from " + describeRevision(uri, revisionDescriptor) + ": " + e.getMessage(),
                LOG_CHANNEL,
                Logger.ERROR);
        } finally {
            closeIndexWriter(indexWriter);
        }
    }

    /**
     * Drop an object revision from the index.
     * All documents stored for the URI are removed and the index is
     * optimized afterwards. Does nothing if the URI is not covered by the
     * configured includes or if the revision is the
     * <code>HIDDEN_0_0</code> instance. I/O errors are logged, not thrown.
     *
     * @param      uri             Uri of the resource
     * @param      number          revision number being dropped
     * @exception  IndexException  declared by the interface; not thrown by
     *                             this implementation
     */
    public synchronized void dropIndex(Uri uri, NodeRevisionNumber number)
        throws IndexException
    {
        if (!isIncluded(uri.toString())) return;
        // NOTE(review): identity comparison kept as found; confirm whether
        // a value comparison via equals() is intended here.
        if (number == NodeRevisionNumber.HIDDEN_0_0) return;

        IndexWriter indexWriter = null;
        try {
            deleteIndexEntries(uri);

            // The writer is opened only to optimize the index after the delete.
            indexWriter = new IndexWriter(indexpath, analyzer, false);
            indexWriter.optimize();

            if (getLogger().isEnabled(Logger.DEBUG)) {
                getLogger().log(
                    "Deleted '" + uri + "' from the index",
                    LOG_CHANNEL,
                    Logger.DEBUG);
            }
        } catch (IOException e) {
            getLogger().log(
                "Impossible to delete " + uri + " - " + number + " from the Lucene index: " + e.getMessage(),
                LOG_CHANNEL,
                Logger.ERROR);
        } finally {
            closeIndexWriter(indexWriter);
        }
    }

    /**
     * Method getFactory
     *
     * @return   a new TextContainsExpressionFactory working on the index
     *           path and analyzer of this store
     */
    public IBasicExpressionFactory getBasicExpressionFactory()
    {
        return new TextContainsExpressionFactory(indexpath, analyzer);
    }

    /**
     * Connects to the underlying data source (if any is needed).
     * This implementation only logs the call and marks the service as started.
     *
     * @exception ServiceConnectionFailedException Connection failed
     */
    public void connect() throws ServiceConnectionFailedException
    {
        getLogger().log(
            "TextContentIndexer: connect",
            LOG_CHANNEL,
            Logger.INFO);
        started = true;
    }

    /**
     * This function tells whether or not the service is connected.
     *
     * @return boolean true if we are connected
     * @exception ServiceAccessException Service access error
     */
    public boolean isConnected() throws ServiceAccessException
    {
        return started;
    }

    /**
     * Parametrize the service. This index store expects a parameter
     * "indexpath" to contain the path to the directory to store the index.
     * Another optional parameter "includes" lists the paths of resources
     * that are to be indexed in a comma-separated format; whitespace
     * around each path is ignored.
     * Everything under an included path is indexed. If not specified all
     * resources will be indexed.
     * The optional parameter "analyzer" names the Lucene analyzer class to
     * use; see {@link #initAnalyzer()}.
     *
     * @param parameters Hashtable containing the parameters' names
     * and associated values
     * @exception ServiceParameterErrorException Incorrect service parameter
     * @exception ServiceParameterMissingException Service parameter missing
     */
    public void setParameters(Hashtable parameters) throws ServiceParameterErrorException, ServiceParameterMissingException
    {
        indexpath = (String) parameters.get(INDEX_PATH);
        if (indexpath == null || indexpath.length() == 0) {
            throw new ServiceParameterMissingException(this, INDEX_PATH);
        }

        String includesParam = (String) parameters.get(INCLUDES);
        if (includesParam != null && includesParam.length() > 0) {
            StringTokenizer tokenizer = new StringTokenizer(includesParam, ",");
            Collection paths = new ArrayList(tokenizer.countTokens());
            while (tokenizer.hasMoreTokens()) {
                // Trim so that "a, b" yields "b" rather than " b", which
                // could never be the prefix of a URI.
                String path = tokenizer.nextToken().trim();
                if (path.length() > 0) {
                    paths.add(path);
                }
            }
            this.includes = paths;
        }

        analyzerClassName = (String) parameters.get(ANALYZER);
    }

    /**
     * Disconnects from the underlying data source.
     * This implementation only logs the call and marks the service as stopped.
     *
     * @exception ServiceDisconnectionFailedException Disconnection failed
     */
    public void disconnect() throws ServiceDisconnectionFailedException
    {
        getLogger().log(
            "TextContentIndexer: disconnect",
            LOG_CHANNEL,
            Logger.INFO);
        started = false;
    }

    /**
     * Deletes service underlying data source, if possible (and meaningful).
     * This implementation only logs the call; the index is left untouched.
     *
     * @exception ServiceResetFailedException Reset failed
     */
    public void reset() throws ServiceResetFailedException
    {
        getLogger().log(
            "TextContentIndexer: reset",
            LOG_CHANNEL,
            Logger.INFO);
    }

    /**
     * Provides the text that is indexed in the {@link #CONTENT_TEXT} field.
     * This implementation returns the characters of the content; the
     * descriptor is not used here.
     *
     * @param revisionDescriptor descriptor of the revision (unused here)
     * @param revisionContent    content of the revision, may be <code>null</code>
     * @return a reader over the content characters; an empty reader if
     *         there is no content
     * @throws IOException declared for overriding implementations
     */
    protected Reader readContent(NodeRevisionDescriptor revisionDescriptor,
                                 NodeRevisionContent revisionContent) throws IOException {
        char[] chars = (revisionContent == null) ? null : revisionContent.getContent();
        if (chars == null) {
            // CharArrayReader rejects null; index an empty text instead.
            chars = new char[0];
        }
        return new CharArrayReader(chars);
    }

    /**
     * Tells whether a resource is to be indexed.
     *
     * @param uri URI of the resource as a string
     * @return <code>true</code> if no includes are configured or if the
     *         URI starts with one of the configured include paths
     */
    protected boolean isIncluded(String uri) {
        if (includes == null) return true;

        for (Iterator paths = includes.iterator(); paths.hasNext();) {
            if (uri.startsWith((String) paths.next())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Sets up the analyzer. If no analyzer class name was configured a
     * StandardAnalyzer is used; otherwise the named class is loaded and
     * instantiated through its no-argument constructor.
     *
     * @throws ServiceInitializationFailedException if the configured class
     *         cannot be loaded, instantiated, or is not an Analyzer
     */
    protected void initAnalyzer() throws ServiceInitializationFailedException {
        if (analyzerClassName == null || analyzerClassName.length() == 0) {
            getLogger().log("using Lucene StandardAnalyzer", LOG_CHANNEL, Logger.INFO);
            analyzer = new StandardAnalyzer();
            return;
        }

        getLogger().log("loading Lucene analyzer: " + analyzerClassName, LOG_CHANNEL, Logger.INFO);
        try {
            Class analyzerClazz = Class.forName(analyzerClassName);
            analyzer = (Analyzer) analyzerClazz.newInstance();
        } catch (ClassNotFoundException cnfe) {
            throw analyzerFailure(cnfe);
        } catch (InstantiationException ie) {
            throw analyzerFailure(ie);
        } catch (IllegalAccessException iae) {
            throw analyzerFailure(iae);
        } catch (ClassCastException cce) {
            // The configured class exists but is not a Lucene Analyzer.
            throw analyzerFailure(cce);
        }
    }

    /** Logs an analyzer instantiation problem and wraps it for the caller to throw. */
    private ServiceInitializationFailedException analyzerFailure(Exception cause) {
        getLogger().log("Error while instantiating analyzer " +
                        analyzerClassName + ": " + cause.getMessage(), LOG_CHANNEL, Logger.ERROR);
        return new ServiceInitializationFailedException(this, cause);
    }

    /**
     * Builds the Lucene document for a resource: the URI (stored, not
     * tokenized), the text from {@link #readContent}, and one additional
     * content field per registered content extractor.
     */
    private Document buildIndexDocument(Uri uri,
                                        NodeRevisionDescriptor revisionDescriptor,
                                        NodeRevisionContent revisionContent)
        throws IOException, ExtractorException
    {
        Document doc = new Document();
        doc.add(new Field(URI_FIELD, uri.toString(), Store.YES, Index.UN_TOKENIZED));
        doc.add(new Field(CONTENT_TEXT, readContent(revisionDescriptor, revisionContent)));

        if (revisionContent != null && revisionDescriptor != null) {
            List extractors = ExtractorManager.getInstance().getContentExtractors(
                uri.getNamespace().getName(), (NodeRevisionDescriptors) null, revisionDescriptor);
            for (int i = 0, l = extractors.size(); i < l; i++) {
                ContentExtractor extractor = (ContentExtractor) extractors.get(i);
                Reader reader = extractor.extract(new ByteArrayInputStream(revisionContent.getContentBytes()));
                doc.add(new Field(CONTENT_TEXT, reader));
            }
        }
        return doc;
    }

    /** Deletes all documents stored for the URI; the reader is closed even if the delete fails. */
    private void deleteIndexEntries(Uri uri) throws IOException {
        IndexReader indexReader = IndexReader.open(indexpath);
        try {
            indexReader.deleteDocuments(new Term(URI_FIELD, uri.toString()));
        } finally {
            indexReader.close();
        }
    }

    /** Closes the writer if one was opened; a failure to close is logged instead of being dropped. */
    private void closeIndexWriter(IndexWriter indexWriter) {
        if (indexWriter == null) return;
        try {
            indexWriter.close();
        } catch (IOException e) {
            getLogger().log(
                "Error closing the Lucene index writer: " + e.getMessage(),
                LOG_CHANNEL,
                Logger.ERROR);
        }
    }

    /** Formats "uri - revision number" for log messages; tolerates a missing descriptor. */
    private String describeRevision(Uri uri, NodeRevisionDescriptor revisionDescriptor) {
        NodeRevisionNumber number =
            (revisionDescriptor == null) ? null : revisionDescriptor.getRevisionNumber();
        return uri + " - " + number;
    }
}
The table below shows all metrics for TextContentIndexer.java.




