Thursday, February 17, 2011

Searching text from pdf file (BLOB) from PL/SQL.

This is based on one of my posting in OTN. We will use Oracle Text to search a string from pdf file stored in BLOB column.

SQL> SELECT * FROM v$version;

BANNER
----------------------------------------------------------------
Oracle Database 10g Enterprise Edition Release 10.2.0.3.0 - Prod
PL/SQL Release 10.2.0.3.0 - Production
CORE    10.2.0.3.0      Production
TNS for 32-bit Windows: Version 10.2.0.3.0 - Production
NLSRTL Version 10.2.0.3.0 - Production

SQL> /* Creating my demo table */
SQL> CREATE TABLE pdm(id_pk INTEGER,fname VARCHAR2(30),pdf_file BLOB);

Table created.
SQL> /* Creating the directory object for latter use */
SQL> CREATE OR REPLACE DIRECTORY TEMP AS 'c:\';

Directory created.

SQL> /* Creating a procedure which will load the BLOBs (pdf files) into the table */
SQL> CREATE OR REPLACE PROCEDURE load_file(pi_id IN INTEGER, pfname IN VARCHAR2) IS
  2    src_file BFILE;
  3    dst_file BLOB;
  4    lgh_file BINARY_INTEGER;
  5  BEGIN
  6    src_file := bfilename('TEMP', pfname);
  7 
  8    INSERT INTO pdm
  9      (id_pk, fname, pdf_file)
 10    VALUES
 11      (pi_id, pfname, EMPTY_BLOB())
 12    RETURNING pdf_file INTO dst_file;
 13 
 14    DBMS_LOB.OPEN(src_file, dbms_lob.file_readonly);
 15    lgh_file := dbms_lob.getlength(src_file);
 16    DBMS_LOB.LOADFROMFILE(dst_file, src_file, lgh_file);
 17    dbms_lob.close(src_file);
 18    COMMIT;
 19  END load_file;
 20  /

Procedure created.

SQL> EXECUTE load_file(1,'plsql_new_in_11gr1.pdf');

PL/SQL procedure successfully completed.

SQL> EXECUTE load_file(2,'Model clause.pdf');

PL/SQL procedure successfully completed.

SQL>
SQL> SELECT id_pk,fname,DBMS_LOB.getlength(pdf_file)
  2  FROM pdm;

     ID_PK FNAME                          DBMS_LOB.GETLENGTH(PDF_FILE)
---------- ------------------------------ ----------------------------
         1 plsql_new_in_11gr1.pdf                               170264
         2 Model clause.pdf                                    4288164

SQL> /* Creating the index */
SQL> CREATE INDEX pdm_pdf_idx ON pdm(pdf_file) INDEXTYPE IS CTXSYS.CONTEXT;

Index created.

SQL> EXEC DBMS_STATS.GATHER_TABLE_STATS(USER, 'PDM', cascade=>TRUE);

PL/SQL procedure successfully completed.

SQL> SELECT id_pk,fname
  2  FROM pdm
  3  WHERE CONTAINS(pdf_file,'PL/SQL')>0;

     ID_PK FNAME
---------- ------------------------------
         1 plsql_new_in_11gr1.pdf

SQL> SELECT id_pk,fname
  2  FROM pdm
  3  WHERE CONTAINS(pdf_file,'Fine-grained access control')>0;

     ID_PK FNAME
---------- ------------------------------
         1 plsql_new_in_11gr1.pdf

SQL> SELECT id_pk,fname,DBMS_LOB.getlength(pdf_file)
  2  FROM pdm;

     ID_PK FNAME                          DBMS_LOB.GETLENGTH(PDF_FILE)
---------- ------------------------------ ----------------------------
         1 plsql_new_in_11gr1.pdf                               170264
         2 Model clause.pdf                                    4288164

SQL> SELECT id_pk,fname
  2  FROM pdm
  3  WHERE CONTAINS(pdf_file,'Saubhik')>0;

no rows selected

SQL> EXECUTE load_file(3,'plsql_conditional_compilation.pdf');
BEGIN load_file(3,'plsql_conditional_compilation.pdf'); END;

*
ERROR at line 1:
ORA-12899: value too large for column "SCOTT"."PDM"."FNAME" (actual: 33,
maximum: 30)
ORA-06512: at "SCOTT.LOAD_FILE", line 8
ORA-06512: at line 1


SQL> EXECUTE load_file(3,'conditional_compilation.pdf');

PL/SQL procedure successfully completed.

SQL> SELECT id_pk,fname,DBMS_LOB.getlength(pdf_file)
  2  FROM pdm;

     ID_PK FNAME                          DBMS_LOB.GETLENGTH(PDF_FILE)
---------- ------------------------------ ----------------------------
         1 plsql_new_in_11gr1.pdf                               170264
         2 Model clause.pdf                                    4288164
         3 conditional_compilation.pdf                          540594

SQL> EXEC DBMS_STATS.GATHER_TABLE_STATS(USER, 'PDM', cascade=>TRUE);

PL/SQL procedure successfully completed.

SQL> SELECT id_pk,fname
  2  FROM pdm
  3  WHERE CONTAINS(pdf_file,'conditional')>0;

     ID_PK FNAME
---------- ------------------------------
         1 plsql_new_in_11gr1.pdf
         3 conditional_compilation.pdf

SQL> SELECT id_pk,fname
  2  FROM pdm
  3  WHERE CONTAINS(pdf_file,'The Catch 22')>0;

     ID_PK FNAME
---------- ------------------------------
         3 conditional_compilation.pdf

SQL> SELECT id_pk,fname,DBMS_LOB.getlength(pdf_file)
  2  FROM pdm;

     ID_PK FNAME                          DBMS_LOB.GETLENGTH(PDF_FILE)
---------- ------------------------------ ----------------------------
         1 plsql_new_in_11gr1.pdf                               170264
         2 Model clause.pdf                                    4288164
         3 conditional_compilation.pdf                          540594

SQL> SELECT id_pk,fname
  2  FROM pdm
  3  WHERE CONTAINS(pdf_file,'Programmers often need to trace the execution')>0;

     ID_PK FNAME
---------- ------------------------------
         3 conditional_compilation.pdf

SQL> SELECT id_pk,fname
  2  FROM pdm
  3  WHERE CONTAINS(pdf_file,'Usability language feature:')>0;

     ID_PK FNAME
---------- ------------------------------
         1 plsql_new_in_11gr1.pdf

SQL> SELECT id_pk,fname
  2  FROM pdm
  3  WHERE CONTAINS(pdf_file,'blah blah blah')>0;

no rows selected



1 comment :

  1. I'm doing the same. But still not able to search anything from BLOB(pdf file).

    ReplyDelete