/[corp_html]/mnogo/parse.inc
This is a repository of my old source code, which isn't updated any more. Go to git.rot13.org for current projects!
ViewVC logotype

Contents of /mnogo/parse.inc

Parent Directory | Revision Log


Revision 1.2 - (show annotations)
Wed Mar 27 16:00:18 2002 UTC (22 years ago) by dpavlin
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +0 -0 lines
FILE REMOVED
*** empty log message ***

1 <?
2
3 // -----------------------------------------------
4 // ParseDocText($text)
5 // -----------------------------------------------
/**
 * Wraps every occurrence of the current search words in highlight markers.
 *
 * A word is highlighted when it appears at the very start of the text or
 * directly after a run of whitespace/punctuation. Matching is
 * case-insensitive and only anchors the beginning of the word, so a search
 * word also highlights the prefix of a longer word ("foo" in "food").
 *
 * Reads the globals:
 *   $all_words           - list of search words to highlight
 *   $stopword_arr        - array whose KEYS are stopwords
 *   $highlight_stopwords - 'yes' to highlight stopwords as well
 *   $hlbeg, $hlend       - text inserted before / after each match
 *
 * @param string $text  text to decorate
 * @return string       text with the highlight markers inserted
 */
function ParseDocText($text){
    global $all_words;
    global $stopword_arr;
    global $highlight_stopwords;
    global $hlbeg, $hlend;

    // Run of separator characters that may precede a highlighted word.
    $sep = "[\s\t\r\n\~\!\@\#\$\%\^\&\*\(\)\-\_\=\+\\\|\{\}\[\]\;\:\'\"\<\>\?\/\,\.]+";

    // Search words first, then (optionally) the stopwords, in that order.
    $words = is_array($all_words) ? array_values($all_words) : array();

    if ($highlight_stopwords == 'yes' && is_array($stopword_arr)) {
        // foreach instead of each(): each() no longer exists in PHP 8.
        foreach ($stopword_arr as $word => $temp) {
            $words[] = $word;
        }
    }

    $str = $text;
    foreach ($words as $word) {
        $word = (string)$word;

        // An empty word would match at every separator; skip it.
        if ($word === '') continue;

        // Quote the word so regex metacharacters in it are matched literally.
        $quoted = preg_quote($word, '/');

        $str = preg_replace("/($sep)($quoted)/i", "\\1$hlbeg\\2$hlend", $str);
        $str = preg_replace("/^($quoted)/i", "$hlbeg\\1$hlend", $str);
    }

    return $str;
}
29
30 // -----------------------------------------------
31 // ParseStr($qwe)
32 // -----------------------------------------------
/**
 * Normalises a free-form user query into a compact boolean expression.
 *
 * The result uses only the operators '&' (and), '|' (or), '~' (not) and
 * parentheses, with no spaces, e.g. "foo bar not baz" -> "foo&bar&~baz".
 *
 * Reads the globals:
 *   $DEFAULT_QUERY_TYPE - 'or' makes '|' the implicit operator between
 *                         adjacent terms; anything else means '&'
 *   $rus_bool_lang      - 'yes' additionally recognises the non-English
 *                         operator words listed below
 *   $DEBUG              - when true, echoes the input and the result
 *
 * @param string $qwe  raw query text
 * @return string      normalised boolean expression
 */
function ParseStr($qwe) {
    global $DEFAULT_QUERY_TYPE;
    global $DEBUG;
    global $rus_bool_lang;

    if ($DEBUG) echo "Begin ParseStr(): qwe=$qwe<BR><HR>";

    // Every run of punctuation (backslash included) becomes a single '&'.
    // Because of this no backslash can reach the later patterns.
    $qwe=preg_replace("/[\!\@\#\$\%\^\*\-\_\+\=\\\{\}\[\]\;\'\:\"\<\>\?\/\,\.]{1,}/","&",(string)$qwe);
    $qwe=trim($qwe);

    // query language normalizer
    // (PCRE replaces the ereg/eregi functions, which were removed in PHP 7.0)

    if ($rus_bool_lang == 'yes') {
        // NOTE(review): the non-ASCII operator words look like KOI8-R Cyrillic
        // shown as Latin-1; they are kept exactly as found - confirm the
        // intended file encoding.
        $qwe=preg_replace("/ {0,}\| {0,}| {1,}or {1,}| {1,}ÉÌÉ {1,}/i","|",$qwe);
        $qwe=preg_replace("/ {0,}\& {0,}| {1,}and {1,}| {1,}É {1,}/i","&",$qwe);
        $qwe=preg_replace("/ {0,}\~ {0,}| {1,}not {1,}| {1,}without {1,}| {1,}ÎÅ {1,}| {1,}ÂÅÚ {1,}/i","~",$qwe);
    } else {
        $qwe=preg_replace('/ {0,}\| {0,}| {1,}or {1,}/i',"|",$qwe);
        $qwe=preg_replace('/ {0,}\& {0,}| {1,}and {1,}/i',"&",$qwe);
        $qwe=preg_replace('/ {0,}\~ {0,}| {1,}not {1,}| {1,}without {1,}/i',"~",$qwe);
    }

    $qwe=preg_replace('/ {0,}\( {0,}/',"(",$qwe);
    $qwe=preg_replace('/ {0,}\) {0,}/',")",$qwe);

    // default query type is and
    $op = (strtolower((string)$DEFAULT_QUERY_TYPE) == 'or') ? '|' : '&';

    // remaining spaces separate terms: join them with the default operator
    $qwe=preg_replace('/ {1,}/',$op,$qwe);
    $qwe=preg_replace('/\&\|{1,}/',$op,$qwe);
    $qwe=preg_replace('/\|\&{1,}/',$op,$qwe);

    // remove unnecessary boolean operators
    $qwe=preg_replace('/\|{1,}/',"|",$qwe);
    $qwe=preg_replace('/&{1,}/',"&",$qwe);
    $qwe=preg_replace('/~{1,}/',"~",$qwe);
    $qwe=preg_replace('/\|\&\|/',"&",$qwe);
    $qwe=preg_replace('/[\|\&\~]{1,}$/D',"",$qwe);
    $qwe=preg_replace('/^[\|\&]{1,}/',"",$qwe);

    // transform "w1 ~w2" -> "w1 default_op ~ w2"
    //           ") ~w"   -> ") default_op ~w"
    //           "w ~ ("  -> "w default_op ~("
    //           ") w"    -> ") default_op w"
    //           "w ("    -> "w default_op ("
    //           ")("     -> ") default_op ("
    $qwe=preg_replace('/([^\&\~\|\(\)]+)~([^\&\~\|\(\)]+)/',"\\1{$op}~\\2",$qwe);
    $qwe=preg_replace('/\)~{1,}/',"){$op}~",$qwe);
    // The operator goes BEFORE the '~'. The old 'or' branch emitted "~|(",
    // which the "~|" clean-up below then reduced to "|", losing the negation.
    $qwe=preg_replace('/~{1,}\(/',"{$op}~(",$qwe);
    $qwe=preg_replace('/\)([^\&\~\|\(\)]+)/',"){$op}\\1",$qwe);
    $qwe=preg_replace('/([^\&\~\|\(\)]+)\(/',"\\1{$op}(",$qwe);
    $qwe=preg_replace('/\) *\(/',"){$op}(",$qwe);

    // remove unnecessary boolean operators
    $qwe=preg_replace('/\|{1,}/',"|",$qwe);
    $qwe=preg_replace('/&{1,}/',"&",$qwe);

    // remove erroneous format of query - ie: '(&', '&)', '(|', '|)', '~&', '~|', '~)'
    $qwe=preg_replace('/\(\&{1,}/',"(",$qwe);
    $qwe=preg_replace('/\&{1,}\)/',")",$qwe);
    $qwe=preg_replace('/\~{1,}\)/',")",$qwe);
    $qwe=preg_replace('/\(\|{1,}/',"(",$qwe);
    $qwe=preg_replace('/\|{1,}\)/',")",$qwe);
    $qwe=preg_replace('/\~{1,}\&{1,}/',"&",$qwe);
    $qwe=preg_replace('/\~{1,}\|{1,}/',"|",$qwe);

    // The "~(" rewrite above puts an operator in front of a query that
    // starts with "~("; a leading binary operator is never valid, so drop it.
    $qwe=preg_replace('/^[\|\&]{1,}/',"",$qwe);

    if ($DEBUG) echo "End ParseStr(): qwe=$qwe<BR><HR>";

    return($qwe);
}
116
117 // -----------------------------------------------
118 // ParseQ($q)
119 // -----------------------------------------------
/**
 * Turns a raw user query into a space-separated token string.
 *
 * The query is first normalised by ParseStr(); its '&', '|' and '~'
 * operators are then spelled "&&", "||" and "!", every token (operators,
 * parentheses, words) is separated by exactly one space, and the whole
 * expression is wrapped in an outer pair of parentheses, e.g. "( a && b )".
 *
 * Reads the global $DEBUG; when true, echoes the input and the result.
 *
 * @param string $q  raw query text
 * @return string    tokenised expression, or '' when $q has nothing but spaces
 */
function ParseQ($q){
    global $DEBUG;

    if ($DEBUG) echo "Begin ParseQ(): q=$q<BR>";

    // Nothing but spaces (or nothing at all): there is no query.
    if (str_replace(' ','',(string)$q)==''){
        return '';
    }

    $q=ParseStr($q);

    // Plain string replacement suffices here; the ereg functions the
    // original used were removed in PHP 7.0.
    $q=str_replace(
        array('&',    '|',    '~',   '(',   ')'),
        array(' && ', ' || ', ' ! ', ' ( ', ' ) '),
        $q
    );
    $q="( $q )";

    // collapse the doubled spaces produced by the padding above
    $q=preg_replace('/ {1,}/',' ',$q);

    if ($DEBUG) echo "End ParseQ(): q=$q<BR>";

    return $q;
}
143
144 // -----------------------------------------------
145 // last_parse($q)
146 // -----------------------------------------------
/**
 * Validates a tokenised boolean query and builds the SQL used to search.
 *
 * $q is a space-separated token string made of "(", ")", "&&", "||", "!"
 * and words (presumably the output of ParseQ() - confirm against callers).
 * A two-state machine checks the token order (state 0 expects an operand,
 * state 1 expects an operator or ")") and builds the HAVING expression.
 * Stopwords are replaced by "1=1"; every other word becomes a placeholder
 * that is later expanded to an OR over all of its forms in $final_word.
 *
 * Side effects visible in this function:
 *   - appends to the globals $all_words, $wordsinfo and, when $soundex is
 *     'yes', $soundex_suggestions;
 *   - calls print_error_local() on a syntax error or unmatched brackets;
 *   - calls check_words() when $ispellmode contains "db" or "text";
 *   - in 'multi' / 'crc-multi' mode on mysql or pgsql, sets the global
 *     $temp_table and runs queries through db_query() to fill that table.
 *
 * NOTE(review): words are interpolated into SQL text without escaping. This
 * function cannot see what normalize_word()/check_words() guarantee about
 * the contents of $final_word - confirm they cannot contain quote
 * characters, or escape them with the database layer's own quoting function.
 *
 * @param string $q  tokenised query
 * @return array     array($query_url_id, $query_url, $query_count_url_id);
 *                   $query_url still contains the literal placeholder %URL_IN%
 */
function last_parse($q){
    global $all_words;
    global $dbtype;
    global $dbmode;
    global $db_format;
    global $ul, $ue, $uestr, $ulstr;
    global $tagstr, $timestr, $catstr,$langstr;
    global $DEBUG;
    global $temp_table;
    global $final_word;
    global $ispellmode;
    global $wordsinfo;
    global $soundex_suggestions,$soundex;

    global $last_mod_field;
    global $crc_field;

    $state=0;
    $qu="";
    $w="";
    $n=0;

    // These were read before ever being assigned in the original code.
    $error="";
    $count=0;
    $sql_small='';
    $norm_word_crc=array();
    $query_url_id='';
    $query_count_url_id='';

    $t=strtok($q," ");
    $words=array();

    if (($dbmode == 'crc') ||
        ($dbmode == 'crc-multi')) {
        // words are stored as crc32 integers: no quoting in SQL
        $word_field = 'word_id';
        $word_table = 'ndict';
        $word_type = 'INT';
        $str_sep = "";
    } else {
        $word_field = 'word';
        $word_table = 'dict';
        $word_type = 'VARCHAR(32)';
        $str_sep = "'";
    }

    while(($t!="")&&($error=="")){

        switch($state){
            case 0:
                // expecting an operand: "!", "(" or a word

                if(($t=="||") ||
                   ($t=="&&") ||
                   ($t==")")) $error="at '$t'";

                else if ($t=="!") {
                    $state=0;
                    $qu="$qu NOT "; break;
                } else if($t=="("){
                    $n++;
                    $state=0;
                    $qu="$qu(";
                } else {
                    $state=1;
                    $t=strtolower($t);
                    if (!is_stopword($t)) {
                        normalize_word($t);
                        $words[]=$t;
                        // placeholder, expanded once the word forms are known
                        $qu="$qu %$t% ";
                    } else {
                        $qu="$qu 1=1 ";
                        $wordsinfo .= "<b>$t</b>: stopword; ";
                    }
                }
                break;

            case 1:
                // expecting a binary operator or ")"
                if(($t=="||")||($t=="&&")){
                    $state=0;
                    if ($t=='||') $qu="$qu OR ";
                    else $qu="$qu AND ";
                }
                else if($t==")") {
                    $n--;
                    $state=1;
                    $qu="$qu)";
                }
                else
                    $error=" at '$t'";
                break;
        }
        $t=strtok(" ");
    }

    if($error=="" && $n != 0) $error='Unmatched brackets';
    if($error!="") print_error_local($error);

    // eregi() was removed in PHP 7.0; one case-insensitive PCRE test
    // covers both of the original checks.
    if (preg_match('/db|text/i',(string)$ispellmode)) check_words();

    if (count($words)) $wordsinfo .= "looking for: ";

    for ($i=0; $i<count($words); $i++) {
        $word=$words[$i];
        $qu_norm='';

        // count() on a missing entry throws in PHP 8; treat it as "no forms".
        $forms=(isset($final_word["$word"]) && is_array($final_word["$word"]))
            ? $final_word["$word"] : array();

        for ($j=0; $j<count($forms); $j++) {
            $norm_word=$forms[$j];
            $all_words[]=$norm_word;

            $wordsinfo .="<b>$norm_word</b>; ";

            if (($dbmode == 'crc') ||
                ($dbmode == 'crc-multi')) {
                $norm_word_crc[$norm_word]=crc32($norm_word);
                $norm_word=$norm_word_crc[$norm_word];
            }

            switch ($dbtype) {
                case 'mysql' : $qu_norm .= " sum($word_field=$str_sep$norm_word$str_sep)>0 OR "; break;
                case 'oracle7':
                case 'oracle8':
                case 'oracle': $qu_norm .= " sum(decode($word_field,$str_sep$norm_word$str_sep,1,0))>0 OR "; break;
                case 'pgsql' : $qu_norm .= " sum(case $word_field when $str_sep$norm_word$str_sep then 1 else 0 end)>0 OR "; break;
                default : $qu_norm .= " sum($word_field=$str_sep$norm_word$str_sep)>0 OR "; break;
            }
            if($w=="") $w="$str_sep$norm_word$str_sep"; else $w .=",$str_sep$norm_word$str_sep";
        }

        $qu_norm="($qu_norm)";
        // drop the dangling OR left by the last form
        $qu_norm=str_replace("OR )",")",$qu_norm);
        $qu=str_replace("%$word%",$qu_norm,$qu);
    }

    if ($soundex == 'yes') {
        $all_count=is_array($all_words) ? count($all_words) : 0;
        for ($i=0; $i<$all_count; $i++) {
            // Suggest for each collected word; the original passed the stale
            // $norm_word left over from the loop above on every iteration.
            $soundex_suggestions .= suggest_soundex($all_words[$i]);
        }
        if ($soundex_suggestions == '') $soundex_suggestions = '-';
    }

    if (($dbtype == 'mysql') || ($dbtype == '')) $sql_small = 'SQL_SMALL_RESULT';

    if (($dbmode == 'multi') ||
        ($dbmode == 'crc-multi')) {
        // ------------------
        // DBMode = multi OR crc-multi
        $query="";

        if ((($dbtype == 'pgsql') || ($dbtype == 'mysql')) &&
            (count($words))){
            $make_temp_table=1;
            $temp_table = 't'.time().rand(1000,9999).rand(1000,9999);
        } else {
            $make_temp_table=0;
            $temp_table="";
        }

        for ($i=0; $i<count($words); $i++) {
            $word=$words[$i];
            $qu_norm='';

            $forms=(isset($final_word["$word"]) && is_array($final_word["$word"]))
                ? $final_word["$word"] : array();

            for ($j=0; $j<count($forms); $j++) {
                $norm_word=$forms[$j];

                $dict=get_dict_tab($norm_word);

                $old_norm_word=$norm_word;
                if ($dbmode == 'crc-multi') $norm_word=$norm_word_crc[$norm_word];

                if ($query=="") {
                    // first word form
                    if ($make_temp_table == 0) {
                        $query="SELECT url_id,$word_field,intag
FROM $dict
WHERE $word_field = $str_sep$norm_word$str_sep";
                    } else {
                        if ($dbtype == 'mysql') {
                            $query="CREATE /*!32302 TEMPORARY */ TABLE $temp_table (
url_id INT DEFAULT '0' NOT NULL,
$word_field $word_type DEFAULT '0' NOT NULL,
intag TINYINT DEFAULT '0' NOT NULL,
KEY i1$temp_table(url_id),
KEY i2$temp_table($word_field))";

                            if($DEBUG) echo "last_parse(): ",$query,"<BR><HR>";
                            if (!db_query($query)) print_error_local('Query error: '.$query."\n<BR>".db_error());

                            $query="INSERT INTO $temp_table
SELECT url_id,$word_field,intag
FROM $dict
WHERE $word_field = $str_sep$norm_word$str_sep";

                            if($DEBUG) echo "last_parse(): ",$query,"<BR><HR>";
                            if (!db_query($query)) print_error_local('Query error: '.$query."\n<BR>".db_error());

                            $query="SELECT count(*)
FROM $temp_table";

                            if($DEBUG) echo "last_parse(): ",$query,"<BR><HR>";
                            if (!$res=db_query($query)) print_error_local('Query error: '.$query."\n<BR>".db_error());

                            if ($row=db_fetchrow($res)) {
                                $count=$row[0];
                                $wordsinfo .= "<b>$old_norm_word</b>: $count; ";
                            }

                            db_freeresult($res);
                        } else {
                            // not mysql
                            $query="SELECT url_id,$word_field,intag
INTO TEMP $temp_table
FROM $dict
WHERE $word_field = $str_sep$norm_word$str_sep";
                        }
                    }
                } else {
                    // every further word form
                    if ($dbtype == 'mysql') {
                        $query ="INSERT INTO $temp_table
SELECT url_id,$word_field,intag
FROM $dict
WHERE $word_field = $str_sep$norm_word$str_sep";
                        if($DEBUG) echo "last_parse(): ",$query,"<BR><HR>";
                        if (!db_query($query)) print_error_local('Query error: '.$query."\n<BR>".db_error());

                        $query="SELECT count(*)
FROM $temp_table";

                        if($DEBUG) echo "last_parse(): ",$query,"<BR><HR>";
                        if (!$res=db_query($query)) print_error_local('Query error: '.$query."\n<BR>".db_error());

                        if ($row=db_fetchrow($res)) {
                            // rows added by this form = new total - previous total
                            $count=$row[0]-$count;
                            $wordsinfo .= "<b>$old_norm_word</b>: $count; ";
                            $count=$row[0];
                        }

                        db_freeresult($res);
                    } else {
                        // The leading newline separates UNION from the previous
                        // literal; without it crc mode produced "123UNION ALL".
                        $query.="\nUNION ALL
SELECT url_id,$word_field,intag
FROM $dict
WHERE $word_field = $str_sep$norm_word$str_sep";
                    }
                }
            }
        }

        if (($make_temp_table == 1) && ($dbtype != 'mysql')) {
            if($DEBUG) echo "last_parse(): ",$query,"<BR><HR>";
            if (!db_query($query)) print_error_local('Query error: '.$query."\n<BR>".db_error());
        }

        if($ulstr||$uestr||$tagstr||$timestr||$catstr||$langstr) {
            if (($dbtype == 'oracle') ||
                ($dbtype == 'oracle7') ||
                ($dbtype == 'oracle8')) {
                $query_url_id="SELECT url_id,
sum(intag) as r
FROM url, ($query)
WHERE url.rec_id=url_id
$catstr
$ulstr
$uestr
$langstr
$tagstr
$timestr
GROUP BY url_id
HAVING ($qu)
ORDER BY r DESC";

                $query_count_url_id="SELECT count(*)
FROM (
SELECT url_id
FROM url, ($query)
WHERE url.rec_id=url_id
$catstr
$ulstr
$uestr
$langstr
$tagstr
$timestr
GROUP BY url_id
HAVING ($qu)
)";

            } elseif (($dbtype == 'pgsql') ||
                      ($dbtype == 'mysql')) {
                $query_url_id="SELECT $sql_small
url_id,
sum(intag) as r
FROM url, $temp_table
WHERE url.rec_id=url_id
$catstr
$ulstr
$uestr
$langstr
$tagstr
$timestr
GROUP BY url_id
HAVING ($qu)
ORDER BY r DESC";

                $query_count_url_id="";
            }
        } else {
            // not ul,ue nor tagstr timestr,catstr,langstr
            if (($dbtype == 'oracle')||
                ($dbtype == 'oracle7') ||
                ($dbtype == 'oracle8')) {
                $query_url_id="SELECT url_id,
sum(intag) as r
FROM ($query)
GROUP BY url_id
HAVING ($qu)
ORDER BY r DESC";

                $query_count_url_id="SELECT count(*)
FROM (
SELECT url_id
FROM ($query)
GROUP BY url_id
HAVING ($qu)
)";
            } elseif (($dbtype == 'pgsql') ||
                      ($dbtype == 'mysql')) {
                $query_url_id="SELECT $sql_small
url_id,
sum(intag) as r
FROM $temp_table
GROUP BY url_id
HAVING ($qu)
ORDER BY r DESC";

                $query_count_url_id="";
            }
        }

    } else {
        // ------------------
        // DBMode = single or crc

        if($ulstr||$uestr||$tagstr||$timestr||$catstr||$langstr) {
            $query_url_id="SELECT $sql_small
$word_table.url_id,
sum($word_table.intag) as r
FROM url,$word_table
WHERE url.rec_id=$word_table.url_id
$catstr
$ulstr
$uestr
$langstr
$tagstr
$timestr
AND $word_table.$word_field in ($w)
GROUP BY url_id
HAVING ($qu)
ORDER BY r DESC";

            $query_count_url_id="SELECT count(*)
FROM (
SELECT $word_table.url_id
FROM url,$word_table
WHERE url.rec_id=$word_table.url_id
$catstr
$ulstr
$uestr
$langstr
$tagstr
$timestr
AND $word_table.$word_field in ($w)
GROUP BY $word_table.url_id
HAVING ($qu)
)";
        } else {
            $query_url_id="SELECT $sql_small
$word_table.url_id,
sum($word_table.intag) as r
FROM $word_table
WHERE $word_table.$word_field in ($w)
GROUP BY url_id
HAVING ($qu)
ORDER BY r DESC";

            $query_count_url_id="SELECT count(*)
FROM (
SELECT $word_table.url_id
FROM $word_table
WHERE $word_table.$word_field in ($w)
GROUP BY $word_table.url_id
HAVING ($qu)
)";
        }
    }

    if ($db_format == '3.1') {
        $cat_field = ',category';
    } else {
        $cat_field = '';
    }


    // %URL_IN% is left in place for the caller to substitute
    $query_url="SELECT $sql_small
url.url,
url.title,
url.txt,
url.content_type,
url.docsize,
$last_mod_field,
url.keywords,
url.description,
$crc_field,
url.rec_id
$cat_field
FROM url
WHERE url.rec_id = %URL_IN%";

    return array($query_url_id,$query_url,$query_count_url_id);
}
556
557 ?>
558

  ViewVC Help
Powered by ViewVC 1.1.26