start.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. <?php
  2. /**
  3. * Elgg search plugin
  4. *
  5. */
  // Run search_init() when the system 'init' event fires.
  elgg_register_event_handler('init', 'system', 'search_init');
  /**
   * Initialize the search plugin.
   *
   * Registers the page handler and the search plugin hooks, stores the
   * full-text minimum/maximum word lengths in $CONFIG->search_info, extends
   * the CSS and header views, and registers the robots.txt hook.
   *
   * @return void
   */
  function search_init() {
      global $CONFIG;

      // presumably defines the hook callbacks registered below -- confirm
      require_once 'search_hooks.php';

      // page handler for search actions and results
      elgg_register_page_handler('search', 'search_page_handler');

      // hook name, hook type, callback -- registered in this exact order.
      // The first three are the default entity searches; tags get both a
      // custom search type and their own search hook.
      $hook_table = array(
          array('search', 'object', 'search_objects_hook'),
          array('search', 'user', 'search_users_hook'),
          array('search', 'group', 'search_groups_hook'),
          array('search_types', 'get_types', 'search_custom_types_tags_hook'),
          array('search', 'tags', 'search_tags_hook'),
      );
      foreach ($hook_table as $entry) {
          list($hook_name, $hook_type, $callback) = $entry;
          elgg_register_plugin_hook_handler($hook_name, $hook_type, $callback);
      }

      // remember the server's min and max word lengths for full-text searching
      $CONFIG->search_info = array();
      $limits = search_get_ft_min_max();
      $CONFIG->search_info['min_chars'] = $limits->min;
      $CONFIG->search_info['max_chars'] = $limits->max;

      // CSS for search elements
      elgg_extend_view('css/elgg', 'search/css');

      // search box in the page header
      elgg_extend_view('page/elements/header', 'search/header');

      elgg_register_plugin_hook_handler('robots.txt', 'site', 'search_exclude_robots');
  }
  /**
   * Page handler for search.
   *
   * When neither a 'q' nor a 'tag' input is present, the first URL segment
   * (if any) is used as the query, to stay compatible with legacy URLs.
   * The search index page is then included.
   *
   * @param array $page Page elements from core page handler
   * @return bool Always true
   */
  function search_page_handler($page) {
      // if there is no q set, we're being called from a legacy installation;
      // maintain backward compatibility by taking the query from the URL.
      if (!get_input('q', get_input('tag', NULL))) {
          // $page may be empty (e.g. a bare /search URL); avoid reading a
          // missing offset and fall back to NULL, which is what the
          // undefined read used to yield.
          $legacy_query = (is_array($page) && isset($page[0])) ? $page[0] : NULL;
          set_input('q', $legacy_query);
      }

      $base_dir = elgg_get_plugins_path() . 'search/pages/search';

      include_once("$base_dir/index.php");
      return true;
  }
  /**
   * Return a string with highlighted matched queries and relevant context.
   * Determines context based upon occurrence and distance of words with each other.
   *
   * Short haystacks (<= $max_length after tags are stripped) are returned whole
   * with the words highlighted. Longer ones are reduced to snippets around the
   * matches, joined with '...'.
   *
   * @param string $haystack          Text to excerpt (tags are stripped)
   * @param string $query             Search query
   * @param int    $min_match_context Characters of context kept on each side of a match
   * @param int    $max_length        Maximum combined length of the snippets
   * @param bool   $tag_match         Search is for tags. When true the word list
   *                                  is left empty, so nothing is highlighted.
   * @return string
   */
  function search_get_highlighted_relevant_substrings($haystack, $query, $min_match_context = 30, $max_length = 300, $tag_match = false) {
      $haystack = strip_tags($haystack);
      $haystack_length = elgg_strlen($haystack);
      $haystack_lc = elgg_strtolower($haystack);

      if (!$tag_match) {
          $words = search_remove_ignored_words($query, 'array');
      } else {
          $words = array();
      }

      // if haystack <= $max_length return the entire haystack w/formatting immediately
      if ($haystack_length <= $max_length) {
          return search_highlight_words($words, $haystack);
      }

      // get the starting positions and lengths for all matching words
      $haystack_lc_length = elgg_strlen($haystack_lc);
      $starts = array();
      $lengths = array();

      foreach ($words as $word) {
          $word = elgg_strtolower($word);
          $word_len = elgg_strlen($word);

          // an empty needle cannot be searched for
          if ($word_len == 0) {
              continue;
          }

          // walk every occurrence; words that never occur add nothing
          $offset = 0;
          while ($offset < $haystack_lc_length) {
              $pos = elgg_strpos($haystack_lc, $word, $offset);
              if ($pos === FALSE) {
                  break;
              }

              $start = max(0, $pos - $min_match_context);
              $stop = $pos + $word_len + $min_match_context;
              $starts[] = $start;
              $lengths[] = $stop - $start;

              // $pos is already absolute, so resume right after this match
              $offset = $pos + $word_len;
          }
      }

      $offsets = search_consolidate_substrings($starts, $lengths);

      // figure out if we can adjust the offsets and lengths
      // in order to return more context
      $total_length = array_sum($offsets);
      if ($total_length < $max_length && $offsets) {
          $add_length = (int) floor((($max_length - $total_length) / count($offsets)) / 2);

          $starts = array();
          $lengths = array();
          foreach ($offsets as $offset => $length) {
              // the window grows to the left only; its end position stays put
              $starts[] = max(0, $offset - $add_length);
              $lengths[] = $length + $add_length;
          }

          $offsets = search_consolidate_substrings($starts, $lengths);
      }

      // sort by order of string size descending (which is roughly
      // the proximity of matched terms) so we can keep the
      // substrings with terms closest together and discard
      // the others as needed to fit within $max_length.
      arsort($offsets);

      $return_strs = array();
      $total_length = 0;
      foreach ($offsets as $start => $length) {
          // continue past if adding this substring exceeds max length
          if ($total_length + $length > $max_length) {
              continue;
          }

          $total_length += $length;
          $return_strs[$start] = trim(elgg_substr($haystack, $start, $length));
      }

      // Nothing fit (every span is longer than $max_length) or nothing matched:
      // fall back to a $max_length excerpt from the largest span, or from the
      // beginning of the text, instead of returning only ellipses.
      if (!$return_strs) {
          $candidates = array_keys($offsets);
          $start = $candidates ? $candidates[0] : 0;
          $return_strs[$start] = trim(elgg_substr($haystack, $start, $max_length));
      }

      // put the strings in order of occurrence
      ksort($return_strs);

      // add ...s where needed
      $return = implode('...', $return_strs);
      if (!array_key_exists(0, $return_strs)) {
          $return = "...$return";
      }

      // add to end of string if last substring doesn't hit the end.
      $starts = array_keys($return_strs);
      $last_pos = $starts[count($starts) - 1];
      if ($last_pos + elgg_strlen($return_strs[$last_pos]) < $haystack_length) {
          $return .= '...';
      }

      return search_highlight_words($words, $return);
  }
  /**
   * Takes an array of offsets and lengths and consolidates any
   * overlapping entries, returning an array of new offsets and lengths.
   *
   * Offsets and lengths are specified in separate, parallel arrays because of
   * possible index collisions with the offsets ($lengths[$k] belongs to
   * $offsets[$k]).
   *
   * @param array $offsets Start positions
   * @param array $lengths Length of the span starting at the same index
   * @return array Map of start offset => merged length, ordered by offset
   */
  function search_consolidate_substrings($offsets, $lengths) {
      // sort offsets by occurrence; asort() keeps the original keys so each
      // offset can still be paired with its own length.
      asort($offsets, SORT_NUMERIC);

      $sorted_offsets = array();
      $sorted_lengths = array();
      foreach ($offsets as $index => $offset) {
          $sorted_offsets[] = $offset;
          $sorted_lengths[] = $lengths[$index];
      }

      $return = array();
      $count = count($sorted_offsets);
      $i = 0;
      while ($i < $count) {
          $offset = $sorted_offsets[$i];
          $end_pos = $offset + $sorted_lengths[$i];
          $i++;

          // swallow every following span that starts before the current end.
          // max() keeps the end from shrinking when a span is fully contained.
          while ($i < $count && $end_pos > $sorted_offsets[$i]) {
              $end_pos = max($end_pos, $sorted_offsets[$i] + $sorted_lengths[$i]);
              $i++;
          }

          // merged spans never share a start offset, so a single map is safe
          $return[$offset] = $end_pos - $offset;
      }

      return $return;
  }
  /**
   * Safely highlights the words in $words found in $string avoiding recursion.
   *
   * All words are matched in a single case-insensitive pass, so the markup
   * inserted for one word can never be matched by another word (or by digits
   * that happen to occur in the text). Each match is wrapped in
   * <strong class="search-highlight search-highlight-colorN">, where N is the
   * 1-based position of the word in $words.
   *
   * @param array  $words  Words to highlight
   * @param string $string Text to highlight them in
   * @return string
   */
  function search_highlight_words($words, $string) {
      $alternatives = array();
      // capture group number => colour index of the word it belongs to
      $colors = array();

      $i = 1;
      foreach ($words as $word) {
          // remove any boolean mode operators
          $word = preg_replace("/([\-\+~])([\w]+)/i", '$2', $word);

          // an empty word would match at every position; skip it but keep
          // its slot in the colour numbering
          if ($word !== '' && $word !== NULL) {
              // escape the delimiter and any other regexp special chars
              $alternatives[] = '(' . preg_quote($word, '/') . ')';
              $colors[count($alternatives)] = $i;
          }
          $i++;
      }

      if (!$alternatives) {
          return $string;
      }

      $pattern = '/' . implode('|', $alternatives) . '/i';
      $flags = PREG_SET_ORDER | PREG_OFFSET_CAPTURE;

      // prefer UTF-8 aware matching; fall back to byte mode if the text or
      // a word is not valid UTF-8 (preg_match_all() then returns false)
      $found = preg_match_all($pattern . 'u', $string, $matches, $flags);
      if ($found === FALSE) {
          $found = preg_match_all($pattern, $string, $matches, $flags);
      }
      if (!$found) {
          return $string;
      }

      // rebuild the string, wrapping each match; offsets are byte offsets
      $result = '';
      $cursor = 0;
      foreach ($matches as $match) {
          $text = $match[0][0];
          $position = $match[0][1];

          // trailing unmatched groups are omitted, so the last entry is the
          // group (and therefore the word) that matched
          $color = $colors[count($match) - 1];

          $result .= substr($string, $cursor, $position - $cursor);
          $result .= "<strong class=\"search-highlight search-highlight-color{$color}\">$text</strong>";
          $cursor = $position + strlen($text);
      }

      return $result . substr($string, $cursor);
  }
  /**
   * Returns a query with too short words removed.
   * (Unless the entire query is shorter than the configured minimum word
   * length, in which case it's taken literally.)
   *
   * Tags and slashes are stripped and the query is trimmed before it is
   * split on whitespace. Surviving words keep their original array keys.
   *
   * @param string $query  Search query
   * @param string $format 'string' to get the words joined by spaces,
   *                       anything else to get an array
   * @return mixed Array of words, or a string when $format is 'string'
   */
  function search_remove_ignored_words($query, $format = 'array') {
      global $CONFIG;

      // don't worry about "s or boolean operators
      $clean = trim(stripslashes(strip_tags($query)));
      $tokens = preg_split('/\s+/', $clean);

      $threshold = $CONFIG->search_info['min_chars'];

      // only filter when the whole query is long enough; shorter queries
      // run in literal mode and are left alone.
      if (elgg_strlen($clean) >= $threshold) {
          $kept = array();
          foreach ($tokens as $key => $token) {
              // drop any words that are too short to be indexed
              if (elgg_strlen($token) < $threshold) {
                  continue;
              }
              $kept[$key] = $token;
          }
          $tokens = $kept;
      }

      return ($format == 'string') ? implode(' ', $tokens) : $tokens;
  }
  /**
   * Finds the most specific existing search view for the given params.
   *
   * Candidates are tried in this order, and the first one that exists wins:
   *  search/<type>/<subtype>/<view_type>
   *  search/<type>/<view_type>
   *  search/<search_type>/<view_type>
   *  search/<view_type>
   *
   * @param array  $params    Original search params (type, subtype, search_type)
   * @param string $view_type list, entity or layout
   * @return string|false View name, or FALSE for an unknown view type or
   *                      when no candidate view exists
   */
  function search_get_search_view($params, $view_type) {
      if (!in_array($view_type, array('list', 'entity', 'layout'))) {
          return FALSE;
      }

      $has_type = !empty($params['type']);
      $candidates = array();

      // special view for this type:subtype
      if ($has_type && !empty($params['subtype'])) {
          $candidates[] = "search/{$params['type']}/{$params['subtype']}/$view_type";
      }

      // view for the type alone
      if ($has_type) {
          $candidates[] = "search/{$params['type']}/$view_type";
      }

      // view for the search type
      if (!empty($params['search_type'])) {
          $candidates[] = "search/{$params['search_type']}/$view_type";
      }

      // generic fallback
      $candidates[] = "search/$view_type";

      foreach ($candidates as $candidate) {
          if (elgg_view_exists($candidate)) {
              return $candidate;
          }
      }

      return FALSE;
  }
  /**
   * Returns a where clause for a search query.
   *
   * Queries shorter than the configured minimum word length are matched
   * literally with LIKE; everything else uses MATCH ... AGAINST, in boolean
   * mode when required.
   *
   * @param str   $table        Prefix for table to search on
   * @param array $fields       Fields to match against
   * @param array $params       Original search params ('query' is read,
   *                            'advanced_search' is optional)
   * @param bool  $use_fulltext When FALSE every space-separated term is
   *                            prefixed with + and boolean mode is forced
   * @return str
   */
  function search_get_where_sql($table, $fields, $params, $use_fulltext = TRUE) {
      global $CONFIG;

      $query = $params['query'];

      // add the table prefix to the fields
      if ($table) {
          foreach (array_keys($fields) as $key) {
              $fields[$key] = "$table." . $fields[$key];
          }
      }

      // if query is shorter than the min for fts words
      // it's likely a single acronym or similar
      // switch to literal mode
      if (elgg_strlen($query) < $CONFIG->search_info['min_chars']) {
          $needle = sanitise_string($query);

          $conditions = array();
          foreach ($fields as $column) {
              $conditions[] = "$column LIKE '%$needle%'";
          }

          return '(' . implode(' OR ', $conditions) . ')';
      }

      // if we're not using full text, rewrite the query for bool mode.
      // exploiting a feature(ish) of bool mode where +-word is the same as -word
      if (!$use_fulltext) {
          $query = '+' . str_replace(' ', ' +', $query);
      }

      // if using advanced, boolean operators, or paired "s, switch into boolean mode
      $has_operators = preg_match("/([\-\+~])([\w]+)/i", $query);
      $is_advanced = !empty($params['advanced_search']);
      $has_quote_pair = (elgg_substr_count($query, '"') >= 2);

      $needs_boolean = (!$use_fulltext || $has_operators || $is_advanced || $has_quote_pair);

      // natural language mode is the default and needs no keyword
      // (the explicit keyword isn't supported in MySQL < 5.1)
      $mode = $needs_boolean ? 'IN BOOLEAN MODE' : '';

      $against = sanitise_string($query);
      $columns = implode(',', $fields);

      return "(MATCH ($columns) AGAINST ('$against' $mode))";
  }
  /**
   * Returns ORDER BY sql for insertion into elgg_get_entities().
   *
   * Only 'created' and 'updated' produce a clause; 'relevance', 'action_on',
   * 'alpha' and any unknown sort yield an empty string.
   *
   * @param str $entities_table Prefix for entities table.
   * @param str $type_table     Prefix for the type table (not used here).
   * @param str $sort           ORDER BY part
   * @param str $order          ASC or DESC (anything else becomes DESC)
   * @return str
   */
  function search_get_order_by_sql($entities_table, $type_table, $sort, $order) {
      // 'relevance' is tested first so the comparison precedence stays the
      // same as a top-to-bottom case list.
      if ($sort == 'relevance') {
          // default is relevance descending.
          // ascending relevancy is silly and complicated.
          $column = '';
      } elseif ($sort == 'created') {
          $column = "$entities_table.time_created";
      } elseif ($sort == 'updated') {
          $column = "$entities_table.time_updated";
      } elseif ($sort == 'action_on') {
          // @todo not supported yet in core
          $column = '';
      } elseif ($sort == 'alpha') {
          // @todo not supported yet because both title
          // and name columns are used for this depending
          // on the entity, which we don't always know.
          $column = NULL;
      } else {
          // unknown sorts behave like relevance
          $column = '';
      }

      $direction = strtolower($order);
      if (!in_array($direction, array('asc', 'desc'), true)) {
          $direction = 'DESC';
      }

      if (!$column) {
          return '';
      }

      return "$column $direction";
  }
  /**
   * Exclude robots from indexing search pages.
   *
   * This is good for performance since search is slow and there are many pages all
   * with the same content.
   *
   * The directives are appended on their own lines: a newline is inserted first
   * when the existing content does not already end with one, and the block is
   * newline-terminated so later handlers can append safely.
   *
   * @param string $hook Hook name
   * @param string $type Hook type
   * @param string $text robots.txt content for plugins
   * @return string
   */
  function search_exclude_robots($hook, $type, $text) {
      $text = (string) $text;

      // keep our directives from fusing with the previous handler's last line
      if ($text !== '' && substr($text, -1) !== "\n") {
          $text .= "\n";
      }

      $text .= "User-agent: *\nDisallow: /search/\n";

      return $text;
  }
  /**
   * Returns minimum and maximum lengths of words for MySQL search.
   *
   * This function looks for stored config values, and, if either is missing,
   * queries the DB and saves them. When the DB cannot supply usable values,
   * the defaults 4 and 90 are used and saved instead.
   *
   * @return stdClass An object with integer min and max properties
   */
  function search_get_ft_min_max() {
      $min = (int) elgg_get_config('search_ft_min_word_len');
      $max = (int) elgg_get_config('search_ft_max_word_len');

      if (!$min || !$max) {
          // defaults from MySQL on Ubuntu Linux
          $min = 4;
          $max = 90;

          try {
              $result = get_data_row('SELECT @@ft_min_word_len as min, @@ft_max_word_len as max');

              // only trust the row when it really came back with two positive
              // values; otherwise keep the defaults rather than storing
              // something that forces a re-query on every call
              if (is_object($result) && isset($result->min, $result->max)
                      && (int) $result->min > 0 && (int) $result->max > 0) {
                  $min = (int) $result->min;
                  $max = (int) $result->max;
              }
          } catch (DatabaseException $e) {
              // some servers don't have these values set which leads to exception
              // we ignore the exception and keep the defaults
          }

          elgg_save_config('search_ft_min_word_len', $min);
          elgg_save_config('search_ft_max_word_len', $max);
      }

      $ft = new stdClass();
      $ft->min = $min;
      $ft->max = $max;
      return $ft;
  }
  455. }