Tuesday, February 22, 2011

Searching and Highlighting text in pdfs (Stored as BLOB) using PL/SQL.

We can search words withing a pdf (stored as BLOB in a table) and can highlight them and store them as html into CLOB using Oracle Text. This demo is also based on one of my posting in OTN.
SQL> /* The database version in which I am working */
SQL> SELECT * FROM v$version;

BANNER
----------------------------------------------------------------
Oracle Database 10g Enterprise Edition Release 10.2.0.3.0 - Prod
PL/SQL Release 10.2.0.3.0 - Production
CORE    10.2.0.3.0      Production
TNS for 32-bit Windows: Version 10.2.0.3.0 - Production
NLSRTL Version 10.2.0.3.0 - Production

SQL> /* Creating my demo table */
SQL> CREATE TABLE pdm(id_pk INTEGER PRIMARY KEY,fname VARCHAR2(30),pdf_file BLOB);

Table created.

SQL> /* Creating the directory object for latter use */
SQL> CREATE OR REPLACE DIRECTORY TEMP AS 'c:\';

Directory created.

SQL> /* Creating a procedure which will load the BLOBs (pdf files) into the table */
SQL> CREATE OR REPLACE PROCEDURE load_file(pi_id IN INTEGER, pfname IN VARCHAR2) IS
  2    src_file BFILE;
  3    dst_file BLOB;
  4    lgh_file BINARY_INTEGER;
  5  BEGIN
  6    src_file := bfilename('TEMP', pfname);
  7 
  8    INSERT INTO pdm
  9      (id_pk, fname, pdf_file)
 10    VALUES
 11      (pi_id, pfname, EMPTY_BLOB())
 12    RETURNING pdf_file INTO dst_file;
 13 
 14    DBMS_LOB.OPEN(src_file, dbms_lob.file_readonly);
 15    lgh_file := dbms_lob.getlength(src_file);
 16    DBMS_LOB.LOADFROMFILE(dst_file, src_file, lgh_file);
 17    dbms_lob.close(src_file);
 18    COMMIT;
 19  END load_file;
 20  /

Procedure created.

SQL> /* Inserting some rows into my table (pdm) */
SQL>
SQL> EXECUTE load_file(1,'plsql_new_in_11gr1.pdf');

PL/SQL procedure successfully completed.

SQL> EXECUTE load_file(3,'conditional_compilation.pdf');

PL/SQL procedure successfully completed.

SQL>
SQL> /* Checking the rows just inserted in the table */
SQL> SELECT id_pk,fname,DBMS_LOB.getlength(pdf_file) file_length
  2  FROM pdm;

     ID_PK FNAME                          FILE_LENGTH
---------- ------------------------------ -----------
         1 plsql_new_in_11gr1.pdf              170264
         3 conditional_compilation.pdf         540594

SQL>
SQL> /* Creating the index */
SQL>
SQL> CREATE INDEX pdm_pdf_idx ON pdm(pdf_file) INDEXTYPE IS CTXSYS.CONTEXT;

Index created.

SQL> /* Checking for any errors*/
SQL> SELECT * FROM ctx_user_index_errors;

no rows selected

SQL> /* Checking the index */
SQL> SELECT idx_name,idx_table_owner,idx_table,idx_status,idx_type
  2  FROM ctx_user_indexes
  3  WHERE idx_name='PDM_PDF_IDX';

IDX_NAME                       IDX_TABLE_OWNER
------------------------------ ------------------------------
IDX_TABLE                      IDX_STATUS   IDX_TYP
------------------------------ ------------ -------
PDM_PDF_IDX                    SCOTT
PDM                            INDEXED      CONTEXT


SQL> /* Just gathering some statistics */
SQL> EXEC DBMS_STATS.GATHER_TABLE_STATS(USER, 'PDM', cascade=>TRUE);

PL/SQL procedure successfully completed.

SQL> /* Firing some quries to check the setup */
SQL>
SQL> SELECT id_pk,fname
  2  FROM pdm
  3  WHERE CONTAINS(pdf_file,'The Catch 22')>0;

     ID_PK FNAME
---------- ------------------------------
         3 conditional_compilation.pdf

SQL>
SQL> SELECT id_pk,fname
  2  FROM pdm
  3  WHERE CONTAINS(pdf_file,'Usability language feature:')>0;

     ID_PK FNAME
---------- ------------------------------
         1 plsql_new_in_11gr1.pdf

SQL>  
SQL> SELECT id_pk,fname
  2  FROM pdm
  3  WHERE CONTAINS(pdf_file,'blah blah blah')>0;

no rows selected

SQL> 
SQL> /* Chreting table to hold the markuped html as CLOB */
SQL>
SQL> CREATE TABLE pdm_markup_table (
  2    query_id  NUMBER,
  3    document  CLOB
  4  );
CREATE TABLE pdm_markup_table (
             *
ERROR at line 1:
ORA-00955: name is already used by an existing object


SQL> /* Chreting table to hold the markuped html as CLOB */
SQL>
SQL> CREATE TABLE pdm_markup_table (
  2    query_id  NUMBER,
  3    document  CLOB
  4  );

Table created.

SQL> /* This anonymous PL/SQL block will search the text from the pdfs
SQL>    and place those into pdm_markup_table.document as html
SQL> */
SQL>
SQL> DECLARE
  2    l_keywords VARCHAR2(200) := 'The Catch 22';
  3    l_query_id INTEGER := 1;
  4  BEGIN
  5    FOR pdm_cur IN (SELECT id_pk, fname
  6                      FROM pdm
  7                     WHERE CONTAINS(pdf_file, l_keywords) > 0)
  8    /* although, We know from the query fired earlier that only one row will be returned
  9       We are checking as if there may be several pdfs with the search words
 10    */
 11     LOOP
 12      -- Generate HTML version of document with
 13      -- highlighted search terms.
 14      CTX_DOC.markup(index_name => 'PDM_PDF_IDX', --My index
 15                     textkey    => TO_CHAR(pdm_cur.id_pk),
 16                     text_query => l_keywords,
 17                     restab     => 'pdm_markup_table', --My table
 18                     query_id   => l_query_id,
 19                     plaintext  => FALSE,
 20                     tagset     => 'HTML_NAVIGATE');
 21   
 22      l_query_id := l_query_id + 1;
 23   
 24    END LOOP;
 25  END;
 26  /

PL/SQL procedure successfully completed.

SQL> SELECT query_id,DBMS_LOB.getlength(document)
  2  FROM pdm_markup_table;

  QUERY_ID DBMS_LOB.GETLENGTH(DOCUMENT)
---------- ----------------------------
         1                      1170359

SQL> /* A small snippet of the html document */
SQL> SELECT query_id,DBMS_LOB.substr(document,300,1)
  2  FROM pdm_markup_table;

  QUERY_ID
----------
DBMS_LOB.SUBSTR(DOCUMENT,300,1)
--------------------------------------------------------------------------------
         1
<html><body><p/>
<br /><div style="position:absolute;top:961px;left:68px;font-family:'Times New R
oman';font-size:12pt;white-space:nowrap;">
</div>
<div style="position:absolute;top:139px;left:408px;font-family:'Times New Roman'
;font-size:17pt;white-space:nowrap;">PL/SQL conditional compilation
</div


SQL> 

No comments :

Post a Comment