pdf2text.php
4.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
<?php
/**
* The code has been taken from hashbangcode.
*
* On @license http://www.hashbangcode.com/about it says: "All of the code
* placed onto this site has been tested to the best of our ability and
* resources so it should work out of the box. If you spot any problems then
* please let us know! You should be aware the all the code here is "use at
* your own risk" and we can't take any responsibility for loss of data or
* server downtime as a result of the code on this site." which is as close to
* a license as it gets. :(
*
* @author philipnorton42: @link http://www.hashbangcode.com/users/philipnorton42
*
* @link https://github.com/philipnorton42/PDFSearch
* @link http://www.hashbangcode.com/blog/zend-lucene-and-pdf-documents-part-2-pdf-data-extraction-437.html
*/
class App_Search_Helper_PdfParser {
    /**
     * Convert a PDF into text.
     *
     * The document is split into "obj ... endobj" sections. For every section
     * that has a dictionary ("<< ... >>") mentioning FlateDecode and a
     * "stream ... endstream" body, the body is inflated and handed to
     * ps2txt(). The collected text is finally reduced to letters, digits and
     * spaces.
     *
     * @param string $data The pdf content.
     * @return string|null The extracted text from the PDF, or NULL when no
     *                     text could be extracted.
     */
    public function pdf2txt($data) {
        $result_data = '';

        foreach ($this->getDataArray($data, "obj", "endobj") as $obj) {
            // Only the first dictionary of the section is inspected.
            $a_filter = $this->getDataArray($obj, "<<", ">>");
            if (!isset($a_filter[0]) || strpos($a_filter[0], "FlateDecode") === false) {
                continue;
            }

            $a_data = $this->getDataArray($obj, "stream", "endstream");
            if (!isset($a_data[0])) {
                continue;
            }

            // Drop the "stream" / "endstream" keywords but keep the body
            // byte-exact: it is binary, so blanket trimming could corrupt it.
            $raw = substr($a_data[0], strlen("stream"), -strlen("endstream"));

            $decoded = $this->inflateStream($raw);
            if ($decoded === false || trim($decoded) === '') {
                continue;
            }

            // If we got data then attempt to extract it.
            $result_data .= ' ' . $this->ps2txt($decoded);
        }

        /**
         * Make sure we don't have large blocks of white space before and after
         * our string. Also extract alphanumerical information to reduce
         * redundant data.
         */
        $result_data = trim((string) preg_replace('/([^a-z0-9 ])/i', ' ', $result_data));

        // Return the data extracted from the document.
        return $result_data === '' ? NULL : $result_data;
    }

    /**
     * Convert a small chunk of data into text.
     *
     * Text is taken from "(...)" groups found inside "[...]" arrays; when that
     * yields nothing, "(...)" groups anywhere in the chunk are used instead.
     *
     * @param string $ps_data The chunk of data to convert.
     * @return string The string extracted from the data. Chunks whose first
     *                byte is below 10 are returned unchanged, chunks starting
     *                with "/CIDInit" yield an empty string.
     */
    public function ps2txt($ps_data) {
        $ps_data = (string) $ps_data;

        // Nothing to inspect; also avoids reading offset 0 of an empty string.
        if ($ps_data === '') {
            return '';
        }

        // Stop this function returning bogus information from non-data string.
        if (ord($ps_data[0]) < 10) {
            return $ps_data;
        }
        if (substr($ps_data, 0, 8) == '/CIDInit') {
            return '';
        }

        // Extract the data from every "[...]" array.
        $result = "";
        foreach ($this->getDataArray($ps_data, "[", "]") as $ps_text) {
            $result .= $this->extractLiteralStrings($ps_text);
        }

        // Didn't catch anything, the data may just be in raw format
        // (outside of [] tags).
        if (trim($result) == "") {
            $result .= $this->extractLiteralStrings($ps_data);
        }

        // Remove any stray characters left over. Note that "|" is a literal
        // inside the character class, so "a", "i" and "|" are all preserved.
        $result = preg_replace('/\b([^a|i])\b/i', ' ', $result);

        return trim($result);
    }

    /**
     * Convert a section of data into an array, separated by the start and end words.
     *
     * Every returned element includes its start and end word. Matches never
     * overlap: the end word is searched after the start word, and the next
     * search resumes after the end word. This keeps a start word that is a
     * suffix of the end word (e.g. "obj" / "endobj") from being found inside
     * the previous end word, and guarantees that the loop terminates even
     * when both words are identical.
     *
     * @param string $data The data.
     * @param string $start_word The start of each section of data.
     * @param string $end_word The end of each section of data.
     * @return array The array of data; empty when nothing matches or when one
     *               of the words is empty.
     */
    public function getDataArray($data, $start_word, $end_word) {
        $a_result = array();

        $data = (string) $data;
        $start_word = (string) $start_word;
        $end_word = (string) $end_word;

        // An empty delimiter cannot delimit anything and would never advance.
        if ($start_word === '' || $end_word === '') {
            return $a_result;
        }

        $start_length = strlen($start_word);
        $end_length = strlen($end_word);
        $offset = 0;

        while (($start = strpos($data, $start_word, $offset)) !== false) {
            $end = strpos($data, $end_word, $start + $start_length);
            if ($end === false) {
                break;
            }

            // data is between start and end (both words included)
            $offset = $end + $end_length;
            $a_result[] = substr($data, $start, $offset - $start);
        }

        return $a_result;
    }

    /**
     * Inflate the raw body of a FlateDecode stream.
     *
     * The body is framed by white space (the line ends after "stream" and
     * before "endstream"). Stripping all surrounding white space is tried
     * first; if that fails, up to four of the stripped trailing bytes are put
     * back one at a time, because the compressed data itself may end in bytes
     * that look like white space (its trailing checksum is four bytes long).
     *
     * @param string $raw The bytes between "stream" and "endstream".
     * @return string|false The inflated data, or FALSE when it cannot be inflated.
     */
    private function inflateStream($raw) {
        $body = ltrim($raw);
        $trimmed = rtrim($body);
        $trimmed_length = strlen($trimmed);

        // gzuncompress() reports corrupt input through a warning; failures are
        // expected here and are handled through the FALSE return value.
        $decoded = @gzuncompress($trimmed);

        $restorable = min(4, strlen($body) - $trimmed_length);
        for ($extra = 1; $decoded === false && $extra <= $restorable; $extra++) {
            $decoded = @gzuncompress(substr($body, 0, $trimmed_length + $extra));
        }

        return $decoded;
    }

    /**
     * Concatenate the contents of every "(...)" group found in the data.
     *
     * @param string $ps_data The data to scan.
     * @return string The group contents without their parentheses.
     */
    private function extractLiteralStrings($ps_data) {
        $text = "";
        foreach ($this->getDataArray($ps_data, "(", ")") as $group) {
            $text .= substr($group, 1, strlen($group) - 2);
        }
        return $text;
    }
}