Friday 23 May 2014

Real-time NLP API

The real-time natural language processing (NLP) API allows users to perform ad-hoc text analytics on documents.
Real-time text analysis uses the existing text analytics resources that are defined for a collection, but analyzes documents without adding them to the index. Users can immediately check the analysis results without waiting for the index to be built or updated.

Requirements

The following system set-up is required to use the real-time NLP API:
  • Real-time NLP requires a text analytics collection that hosts text analytics resources.
  • Administrators prepare the collection for real-time NLP by configuring the facet tree, dictionaries, and patterns for text extraction, just as they would for a typical text analytics collection. The results of real-time NLP reflect the configuration of that collection.
  • The parse and index sessions for the collection must be running because these sessions provide the document processing engine for the real-time NLP API.
  • Search sessions for the collection must be running because these sessions serve as the gateway for the real-time NLP API.
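
For illustration, the following Java sketch shows the general shape of a real-time NLP call made through the search server's HTTP gateway. The host, port, endpoint path, and parameter names here are assumptions for illustration only, not the documented interface; check the REST API documentation for your product version for the actual values.

import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class RealtimeNlpClient {
   public static void main(String[] args) throws Exception {
      // Hypothetical search server endpoint and collection name;
      // replace with the values documented for your installation.
      URL url = new URL(
            "http://searchserver:8393/api/v10/analysis?collection=sample");
      HttpURLConnection conn = (HttpURLConnection) url.openConnection();
      conn.setRequestMethod("POST");
      conn.setDoOutput(true);
      conn.setRequestProperty("Content-Type", "text/plain; charset=UTF-8");

      // Send the document text; it is analyzed but never added to the index.
      String document = "IBM acquired a software company in Armonk.";
      OutputStream out = conn.getOutputStream();
      out.write(document.getBytes("UTF-8"));
      out.close();

      // Print the analysis result returned through the search gateway.
      InputStream in = conn.getInputStream();
      byte[] buffer = new byte[4096];
      int length;
      while ((length = in.read(buffer)) != -1) {
         System.out.write(buffer, 0, length);
      }
      in.close();
   }
}

Because the document is processed by the running parse and index session but never added to the index, the response can be inspected immediately.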

Sample plug-in application for non-web crawlers

The sample crawler plug-in application shows how you can change security token values, metadata, and the content of crawled documents.

package sample;

import java.io.BufferedWriter;
import java.io.OutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.List;

import com.ibm.es.crawler.plugin.AbstractCrawlerPlugin;
import com.ibm.es.crawler.plugin.Content;
import com.ibm.es.crawler.plugin.CrawledData;
import com.ibm.es.crawler.plugin.CrawlerPluginException;
import com.ibm.es.crawler.plugin.FieldMetadata;

/**
 * The <code>MyCrawlerPlugin</code> is a sample crawler plugin module.
 */
public class MyCrawlerPlugin extends AbstractCrawlerPlugin {

   
   /**
    * Default constructor.
    */
   public MyCrawlerPlugin() {
      super();
   }

   /**
    * Initialize this object.
    * 
    * This sample program does nothing in this method.
    * 
    * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#init()
    */
   public void init() throws CrawlerPluginException {

      /*
       * [Tips]
       * If your crawler plugin module needs to do any initialization,
       * add the code here.
       * [Example]
       * Get a JDBC connection for your local system.
       * connection = DriverManager.getConnection("jdbc:db2://xxxx");
       */

   }

   /**
    * Returns the Boolean value for metadata usage.
    * 
    * This sample program returns <code>true</code>.
    *  
    * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#isMetadataUsed()
    */
   public boolean isMetadataUsed() {

      /*
       * [Tips]
       * If your crawler plugin module updates both metadata and security
       * tokens, return true.
       * If your crawler plugin module updates security tokens only,
       * return false.
       */
      return true;
   }

   /**
    * Terminate this object.
    * 
    * This sample program does nothing in this method.
    * 
    * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#term()
    */
   public void term() throws CrawlerPluginException {

      /*
       * [Tips]
       * If your crawler plugin module needs to do any cleanup at
       * termination, add the code here.
       * [Example]
       * Close the JDBC connection that was opened in init().
       * connection.close();
       */

      return;

   }

   /**
    * Update crawled data.
    * 
    * This sample program updates the security tokens, metadata, and content.
    * 
    * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#updateDocument(com.ibm.es.crawler.plugin.CrawledData)
    */
   public CrawledData updateDocument(CrawledData crawledData) 
   throws CrawlerPluginException {

      // Get the URI string, security tokens, and field metadata.
      String uri = crawledData.getURI();
      String securityTokens = crawledData.getSecurityTokens();
      List metadataList = crawledData.getMetadataList();
      if (metadataList == null) {
         metadataList = new ArrayList();
      }

      /*
       * [Tips]
       * If your crawler plugin module rejects some crawled data,
       * add the check code here and return null.
       */
      // This sample never rejects a document; it always returns the
      // updated document.
      if (false) {
         return null;
      }

      /*
       * [Tips]
       * If your crawler plugin module updates the security tokens,
       * add the code here.
       */
      // update the security tokens (for sample)
      String newToken = "SampleToken";
      String newSecurityTokens = securityTokens + "," + newToken;
      crawledData.setSecurityTokens(newSecurityTokens);

      /*
       * [Tips]
       * If your crawler plugin module updates metadata,
       * add the code here.
       */
      // update metadata (for sample)
      FieldMetadata newFieldMetaData = new FieldMetadata("copyright", "IBM");
      metadataList.add(newFieldMetaData);
      crawledData.setMetadataList(metadataList);
      
      
      /*
       * Set language. 
       */
      crawledData.setLanguage("en");
      crawledData.setLanguageAutoDetection(true);
      
      /*
       * Update the content (available since version 8.3).
       */
      Content content = crawledData.getOriginalContent();
      
      java.io.InputStream in = null;
      
      try{
         // if the original crawled content is null, create the new content.
         if(content == null){
            content = crawledData.createNewContent();
         } else {
            // if the original crawled content exists, get InputStream 
            // object to access it.
            in = content.getInputStream();
            
            // read the content from the stream here if the plug-in
            // needs to inspect the original data

            in.close();
         }
      }catch(IOException ioe){
         throw new CrawlerPluginException(ioe);
      }
      
      // Set information about the content.
      content.setCodepage("UTF-8");
      content.setCodepageAutoDetection(true);
      content.setMimeType("text/plain");

      // Overwrite the content.
      try{

         OutputStream outputStream = content.getOutputStream();

         // write the new content to the OutputStream
         String newText = "The new content of plain text ";
         BufferedWriter writer = new BufferedWriter(
               new OutputStreamWriter(outputStream, "UTF-8"));
         writer.write(newText);
         writer.flush();
         writer.close();

      }catch(IOException ioe){
         throw new CrawlerPluginException(ioe);
      }
      
      // Submit the changed content.
      crawledData.submitContent(content);
      
      return crawledData;
   }
 
   /**
    * Returns the Boolean value for content usage.
    * 
    * This sample program returns <code>true</code> because it updates
    * the document content.
    * 
    * @see com.ibm.es.crawler.plugin.AbstractCrawlerPlugin#isContentUsed()
    */
   public boolean isContentUsed() {
      return true;
   }

}
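
To deploy the plug-in, compile the class and package it, together with any classes it depends on, into a JAR file, and then register the JAR and the plug-in class name in the crawler's plug-in settings. The exact registration steps, typically made in the administration console, vary by product version, so check the documentation for your release.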
