We can search words withing a pdf (stored as BLOB in a table) and can highlight them and store them as html into CLOB using Oracle Text. This demo is also based on one of my posting in OTN.
SQL> /* The database version in which I am working */
SQL> SELECT * FROM v$version;
BANNER
----------------------------------------------------------------
Oracle Database 10g Enterprise Edition Release 10.2.0.3.0 - Prod
PL/SQL Release 10.2.0.3.0 - Production
CORE 10.2.0.3.0 Production
TNS for 32-bit Windows: Version 10.2.0.3.0 - Production
NLSRTL Version 10.2.0.3.0 - Production
SQL> /* Creating my demo table */
SQL> CREATE TABLE pdm(id_pk INTEGER PRIMARY KEY,fname VARCHAR2(30),pdf_file BLOB);
Table created.
SQL> /* Creating the directory object for latter use */
SQL> CREATE OR REPLACE DIRECTORY TEMP AS 'c:\';
Directory created.
SQL> /* Creating a procedure which will load the BLOBs (pdf files) into the table */
SQL> CREATE OR REPLACE PROCEDURE load_file(pi_id IN INTEGER, pfname IN VARCHAR2) IS
2 src_file BFILE;
3 dst_file BLOB;
4 lgh_file BINARY_INTEGER;
5 BEGIN
6 src_file := bfilename('TEMP', pfname);
7
8 INSERT INTO pdm
9 (id_pk, fname, pdf_file)
10 VALUES
11 (pi_id, pfname, EMPTY_BLOB())
12 RETURNING pdf_file INTO dst_file;
13
14 DBMS_LOB.OPEN(src_file, dbms_lob.file_readonly);
15 lgh_file := dbms_lob.getlength(src_file);
16 DBMS_LOB.LOADFROMFILE(dst_file, src_file, lgh_file);
17 dbms_lob.close(src_file);
18 COMMIT;
19 END load_file;
20 /
Procedure created.
SQL> /* Inserting some rows into my table (pdm) */
SQL>
SQL> EXECUTE load_file(1,'plsql_new_in_11gr1.pdf');
PL/SQL procedure successfully completed.
SQL> EXECUTE load_file(3,'conditional_compilation.pdf');
PL/SQL procedure successfully completed.
SQL>
SQL> /* Checking the rows just inserted in the table */
SQL> SELECT id_pk,fname,DBMS_LOB.getlength(pdf_file) file_length
2 FROM pdm;
ID_PK FNAME FILE_LENGTH
---------- ------------------------------ -----------
1 plsql_new_in_11gr1.pdf 170264
3 conditional_compilation.pdf 540594
SQL>
SQL> /* Creating the index */
SQL>
SQL> CREATE INDEX pdm_pdf_idx ON pdm(pdf_file) INDEXTYPE IS CTXSYS.CONTEXT;
Index created.
SQL> /* Checking for any errors*/
SQL> SELECT * FROM ctx_user_index_errors;
no rows selected
SQL> /* Checking the index */
SQL> SELECT idx_name,idx_table_owner,idx_table,idx_status,idx_type
2 FROM ctx_user_indexes
3 WHERE idx_name='PDM_PDF_IDX';
IDX_NAME IDX_TABLE_OWNER
------------------------------ ------------------------------
IDX_TABLE IDX_STATUS IDX_TYP
------------------------------ ------------ -------
PDM_PDF_IDX SCOTT
PDM INDEXED CONTEXT
SQL> /* Just gathering some statistics */
SQL> EXEC DBMS_STATS.GATHER_TABLE_STATS(USER, 'PDM', cascade=>TRUE);
PL/SQL procedure successfully completed.
SQL> /* Firing some quries to check the setup */
SQL>
SQL> SELECT id_pk,fname
2 FROM pdm
3 WHERE CONTAINS(pdf_file,'The Catch 22')>0;
ID_PK FNAME
---------- ------------------------------
3 conditional_compilation.pdf
SQL>
SQL> SELECT id_pk,fname
2 FROM pdm
3 WHERE CONTAINS(pdf_file,'Usability language feature:')>0;
ID_PK FNAME
---------- ------------------------------
1 plsql_new_in_11gr1.pdf
SQL>
SQL> SELECT id_pk,fname
2 FROM pdm
3 WHERE CONTAINS(pdf_file,'blah blah blah')>0;
no rows selected
SQL>
SQL> SELECT * FROM v$version;
BANNER
----------------------------------------------------------------
Oracle Database 10g Enterprise Edition Release 10.2.0.3.0 - Prod
PL/SQL Release 10.2.0.3.0 - Production
CORE 10.2.0.3.0 Production
TNS for 32-bit Windows: Version 10.2.0.3.0 - Production
NLSRTL Version 10.2.0.3.0 - Production
SQL> /* Creating my demo table */
SQL> CREATE TABLE pdm(id_pk INTEGER PRIMARY KEY,fname VARCHAR2(30),pdf_file BLOB);
Table created.
SQL> /* Creating the directory object for latter use */
SQL> CREATE OR REPLACE DIRECTORY TEMP AS 'c:\';
Directory created.
SQL> /* Creating a procedure which will load the BLOBs (pdf files) into the table */
SQL> CREATE OR REPLACE PROCEDURE load_file(pi_id IN INTEGER, pfname IN VARCHAR2) IS
2 src_file BFILE;
3 dst_file BLOB;
4 lgh_file BINARY_INTEGER;
5 BEGIN
6 src_file := bfilename('TEMP', pfname);
7
8 INSERT INTO pdm
9 (id_pk, fname, pdf_file)
10 VALUES
11 (pi_id, pfname, EMPTY_BLOB())
12 RETURNING pdf_file INTO dst_file;
13
14 DBMS_LOB.OPEN(src_file, dbms_lob.file_readonly);
15 lgh_file := dbms_lob.getlength(src_file);
16 DBMS_LOB.LOADFROMFILE(dst_file, src_file, lgh_file);
17 dbms_lob.close(src_file);
18 COMMIT;
19 END load_file;
20 /
Procedure created.
SQL> /* Inserting some rows into my table (pdm) */
SQL>
SQL> EXECUTE load_file(1,'plsql_new_in_11gr1.pdf');
PL/SQL procedure successfully completed.
SQL> EXECUTE load_file(3,'conditional_compilation.pdf');
PL/SQL procedure successfully completed.
SQL>
SQL> /* Checking the rows just inserted in the table */
SQL> SELECT id_pk,fname,DBMS_LOB.getlength(pdf_file) file_length
2 FROM pdm;
ID_PK FNAME FILE_LENGTH
---------- ------------------------------ -----------
1 plsql_new_in_11gr1.pdf 170264
3 conditional_compilation.pdf 540594
SQL>
SQL> /* Creating the index */
SQL>
SQL> CREATE INDEX pdm_pdf_idx ON pdm(pdf_file) INDEXTYPE IS CTXSYS.CONTEXT;
Index created.
SQL> /* Checking for any errors*/
SQL> SELECT * FROM ctx_user_index_errors;
no rows selected
SQL> /* Checking the index */
SQL> SELECT idx_name,idx_table_owner,idx_table,idx_status,idx_type
2 FROM ctx_user_indexes
3 WHERE idx_name='PDM_PDF_IDX';
IDX_NAME IDX_TABLE_OWNER
------------------------------ ------------------------------
IDX_TABLE IDX_STATUS IDX_TYP
------------------------------ ------------ -------
PDM_PDF_IDX SCOTT
PDM INDEXED CONTEXT
SQL> /* Just gathering some statistics */
SQL> EXEC DBMS_STATS.GATHER_TABLE_STATS(USER, 'PDM', cascade=>TRUE);
PL/SQL procedure successfully completed.
SQL> /* Firing some quries to check the setup */
SQL>
SQL> SELECT id_pk,fname
2 FROM pdm
3 WHERE CONTAINS(pdf_file,'The Catch 22')>0;
ID_PK FNAME
---------- ------------------------------
3 conditional_compilation.pdf
SQL>
SQL> SELECT id_pk,fname
2 FROM pdm
3 WHERE CONTAINS(pdf_file,'Usability language feature:')>0;
ID_PK FNAME
---------- ------------------------------
1 plsql_new_in_11gr1.pdf
SQL>
SQL> SELECT id_pk,fname
2 FROM pdm
3 WHERE CONTAINS(pdf_file,'blah blah blah')>0;
no rows selected
SQL>
SQL> /* Chreting table to hold the markuped html as CLOB */
SQL>
SQL> CREATE TABLE pdm_markup_table (
2 query_id NUMBER,
3 document CLOB
4 );
CREATE TABLE pdm_markup_table (
*
ERROR at line 1:
ORA-00955: name is already used by an existing object
SQL> /* Chreting table to hold the markuped html as CLOB */
SQL>
SQL> CREATE TABLE pdm_markup_table (
2 query_id NUMBER,
3 document CLOB
4 );
Table created.
SQL> /* This anonymous PL/SQL block will search the text from the pdfs
SQL> and place those into pdm_markup_table.document as html
SQL> */
SQL>
SQL> DECLARE
2 l_keywords VARCHAR2(200) := 'The Catch 22';
3 l_query_id INTEGER := 1;
4 BEGIN
5 FOR pdm_cur IN (SELECT id_pk, fname
6 FROM pdm
7 WHERE CONTAINS(pdf_file, l_keywords) > 0)
8 /* although, We know from the query fired earlier that only one row will be returned
9 We are checking as if there may be several pdfs with the search words
10 */
11 LOOP
12 -- Generate HTML version of document with
13 -- highlighted search terms.
14 CTX_DOC.markup(index_name => 'PDM_PDF_IDX', --My index
15 textkey => TO_CHAR(pdm_cur.id_pk),
16 text_query => l_keywords,
17 restab => 'pdm_markup_table', --My table
18 query_id => l_query_id,
19 plaintext => FALSE,
20 tagset => 'HTML_NAVIGATE');
21
22 l_query_id := l_query_id + 1;
23
24 END LOOP;
25 END;
26 /
PL/SQL procedure successfully completed.
SQL> SELECT query_id,DBMS_LOB.getlength(document)
2 FROM pdm_markup_table;
QUERY_ID DBMS_LOB.GETLENGTH(DOCUMENT)
---------- ----------------------------
1 1170359
SQL> /* A small snippet of the html document */
SQL> SELECT query_id,DBMS_LOB.substr(document,300,1)
2 FROM pdm_markup_table;
QUERY_ID
----------
DBMS_LOB.SUBSTR(DOCUMENT,300,1)
--------------------------------------------------------------------------------
1
<html><body><p/>
<br /><div style="position:absolute;top:961px;left:68px;font-family:'Times New R
oman';font-size:12pt;white-space:nowrap;">
</div>
<div style="position:absolute;top:139px;left:408px;font-family:'Times New Roman'
;font-size:17pt;white-space:nowrap;">PL/SQL conditional compilation
</div
SQL>
SQL>
SQL> CREATE TABLE pdm_markup_table (
2 query_id NUMBER,
3 document CLOB
4 );
CREATE TABLE pdm_markup_table (
*
ERROR at line 1:
ORA-00955: name is already used by an existing object
SQL> /* Chreting table to hold the markuped html as CLOB */
SQL>
SQL> CREATE TABLE pdm_markup_table (
2 query_id NUMBER,
3 document CLOB
4 );
Table created.
SQL> /* This anonymous PL/SQL block will search the text from the pdfs
SQL> and place those into pdm_markup_table.document as html
SQL> */
SQL>
SQL> DECLARE
2 l_keywords VARCHAR2(200) := 'The Catch 22';
3 l_query_id INTEGER := 1;
4 BEGIN
5 FOR pdm_cur IN (SELECT id_pk, fname
6 FROM pdm
7 WHERE CONTAINS(pdf_file, l_keywords) > 0)
8 /* although, We know from the query fired earlier that only one row will be returned
9 We are checking as if there may be several pdfs with the search words
10 */
11 LOOP
12 -- Generate HTML version of document with
13 -- highlighted search terms.
14 CTX_DOC.markup(index_name => 'PDM_PDF_IDX', --My index
15 textkey => TO_CHAR(pdm_cur.id_pk),
16 text_query => l_keywords,
17 restab => 'pdm_markup_table', --My table
18 query_id => l_query_id,
19 plaintext => FALSE,
20 tagset => 'HTML_NAVIGATE');
21
22 l_query_id := l_query_id + 1;
23
24 END LOOP;
25 END;
26 /
PL/SQL procedure successfully completed.
SQL> SELECT query_id,DBMS_LOB.getlength(document)
2 FROM pdm_markup_table;
QUERY_ID DBMS_LOB.GETLENGTH(DOCUMENT)
---------- ----------------------------
1 1170359
SQL> /* A small snippet of the html document */
SQL> SELECT query_id,DBMS_LOB.substr(document,300,1)
2 FROM pdm_markup_table;
QUERY_ID
----------
DBMS_LOB.SUBSTR(DOCUMENT,300,1)
--------------------------------------------------------------------------------
1
<html><body><p/>
<br /><div style="position:absolute;top:961px;left:68px;font-family:'Times New R
oman';font-size:12pt;white-space:nowrap;">
</div>
<div style="position:absolute;top:139px;left:408px;font-family:'Times New Roman'
;font-size:17pt;white-space:nowrap;">PL/SQL conditional compilation
</div
SQL>
No comments :
Post a Comment