crawler.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: set expandtab tabstop=4 shiftwidth=4:
"""
$Id$

This file is part of the xsser project, http://xsser.03c8.net

Copyright (c) 2011/2016 psy <epsylon@riseup.net>

xsser is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation version 3 of the License.

xsser is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along
with xsser; if not, write to the Free Software Foundation, Inc., 51
Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
import sys
import urllib
import urllib2
import urlparse
import pycurl
import time
import traceback
import curlcontrol
import threadpool
from Queue import Queue
from collections import defaultdict
from bs4 import BeautifulSoup  # parsing below uses the bs4 API (find_all / from_encoding)

class EmergencyLanding(Exception):
    pass

class Crawler(object):
    """
    Crawler class.
    Crawls a webpage looking for url arguments.
    Don't call from several threads! You should create a new one
    for every thread.
    """

    def __init__(self, parent, curlwrapper=None, crawled=None, pool=None):
        # verbose: 0-no printing, 1-prints dots, 2-prints full output
        self.verbose = 1
        self._parent = parent
        self._to_crawl = []
        self._parse_external = True
        self._requests = []
        self._ownpool = False
        self._reporter = None
        self._armed = True
        self._poolsize = 10
        self._found_args = defaultdict(list)
        self.pool = pool
        if crawled:
            self._crawled = crawled
        else:
            self._crawled = []
        if curlwrapper:
            self.curl = curlwrapper
        else:
            self.curl = curlcontrol.Curl

    def report(self, msg):
        if self._reporter:
            self._reporter.report(msg)
        else:
            print msg

    def set_reporter(self, reporter):
        self._reporter = reporter

    def _find_args(self, url):
        """
        find parameters in given url.
        """
        parsed = urllib2.urlparse.urlparse(url)
        qs = urlparse.parse_qs(parsed.query)
        if parsed.scheme:
            path = parsed.scheme + "://" + parsed.netloc + parsed.path
        else:
            path = parsed.netloc + parsed.path
        for arg_name in qs:
            key = (arg_name, parsed.netloc)
            zipped = zip(*self._found_args[key])
            if not zipped or not path in zipped[0]:
                self._found_args[key].append([path, url])
                self.generate_result(arg_name, path, url)
        ncurrent = sum(map(lambda s: len(s), self._found_args.values()))
        if ncurrent >= self._max:
            self._armed = False

    def cancel(self):
        self._armed = False

    def crawl(self, path, depth=3, width=0, local_only=True):
        """
        setup and perform a crawl on the given url.
        """
        if not self._armed:
            return []
        parsed = urllib2.urlparse.urlparse(path)
        basepath = parsed.scheme + "://" + parsed.netloc
        self._parse_external = not local_only
        if not self.pool:
            self.pool = threadpool.ThreadPool(self._poolsize)
        if self.verbose == 2:
            self.report("crawling: " + path)
        if width == 0:
            self._max = 1000000000
        else:
            self._max = int(width)
        self._path = path
        self._depth = depth
        attack_urls = []
        if not self._parent._landing and self._armed:
            self._crawl(basepath, path, depth, width)
        if self._ownpool:
            self.pool.dismissWorkers(len(self.pool.workers))
            self.pool.joinAllDismissedWorkers()
        return attack_urls

    def shutdown(self):
        if self._ownpool:
            self.pool.dismissWorkers(len(self.pool.workers))
            self.pool.joinAllDismissedWorkers()

    def generate_result(self, arg_name, path, url):
        # rebuild the query string with the tested parameter replaced by the
        # "VECTOR" placeholder and record the resulting attack url once
        parsed = urllib2.urlparse.urlparse(url)
        qs = urlparse.parse_qs(parsed.query)
        qs_joint = {}
        for key, val in qs.iteritems():
            qs_joint[key] = val[0]
        attack_qs = dict(qs_joint)
        attack_qs[arg_name] = "VECTOR"
        attack_url = path + '?' + urllib.urlencode(attack_qs)
        if not attack_url in self._parent.crawled_urls:
            self._parent.crawled_urls.append(attack_url)

    def _crawl(self, basepath, path, depth=3, width=0):
        """
        perform a crawl on the given url.
        this function downloads and looks for links.
        """
        self._crawled.append(path)
        if not path.startswith("http"):
            return
        def _cb(request, result):
            # note: unused; _curl_main calls _get_done directly
            self._get_done(depth, width, request, result)
        self._requests.append(path)
        self.pool.addRequest(self._curl_main, [[path, depth, width, basepath]],
                             self._get_done_dummy, self._get_error)

    def _curl_main(self, pars):
        path, depth, width, basepath = pars
        if not self._armed or len(self._parent.crawled_urls) >= self._max:
            raise EmergencyLanding
        c = self.curl()
        c.set_timeout(5)
        try:
            res = c.get(path)
        except Exception as error:
            c.close()
            del c
            raise error
        c_info = c.info().get('content-type', None)
        c.close()
        del c
        self._get_done(basepath, depth, width, path, res, c_info)

    def _get_error(self, request, error):
        try:
            path, depth, width, basepath = request.args[0]
            e_type, e_value, e_tb = error
            if e_type == pycurl.error:
                errno, message = e_value.args
                if errno == 28:
                    print("requests pyerror -1")
                    self.enqueue_jobs()
                    self._requests.remove(path)
                    return  # timeout
                else:
                    self.report('crawler curl error: '+message+' ('+str(errno)+')')
            elif e_type == EmergencyLanding:
                pass
            else:
                traceback.print_tb(e_tb)
                self.report('crawler error: '+str(e_value)+' '+path)
            if not e_type == EmergencyLanding:
                for reporter in self._parent._reporters:
                    reporter.mosquito_crashed(path, str(e_value))
            self.enqueue_jobs()
            self._requests.remove(path)
        except:
            return

    def _emergency_parse(self, html_data, start=0):
        # Fallback link extraction: scan the raw html for "href=" occurrences,
        # used when BeautifulSoup is unavailable or fails and to catch links
        # the soup pass missed.
        links = set()
        pos = 0
        if not html_data:
            return
        data_len = len(html_data)
        while pos < data_len:
            if len(links)+start > self._max:
                break
            pos = html_data.find("href=", pos)
            if not pos == -1:
                sep = html_data[pos+5]
                if sep == "h":
                    # unquoted href: read up to the closing '>' instead
                    pos -= 1
                    sep = ">"
                href = html_data[pos+6:html_data.find(sep, pos+7)].split("#")[0]
                pos = pos+1
                links.add(href)
            else:
                break
        return map(lambda s: {'href': s}, links)

    def _get_done_dummy(self, request, result):
        path = request.args[0][0]
        self.enqueue_jobs()
        self._requests.remove(path)

    def enqueue_jobs(self):
        if len(self.pool.workRequests) < int(self._max/2):
            while self._to_crawl:
                next_job = self._to_crawl.pop()
                self._crawl(*next_job)

    def _get_done(self, basepath, depth, width, path, html_data, content_type):
        if not self._armed or len(self._parent.crawled_urls) >= self._max:
            raise EmergencyLanding
        try:
            encoding = content_type.split(";")[1].split("=")[1].strip()
        except:
            encoding = None
        try:
            soup = BeautifulSoup(html_data, from_encoding=encoding)
            links = None
        except:
            soup = None
            links = self._emergency_parse(html_data)
        for reporter in self._parent._reporters:
            reporter.start_crawl(path)
        if not links and soup:
            links = soup.find_all('a')
            forms = soup.find_all('form')
            for form in forms:
                pars = {}
                if form.has_attr("action"):
                    action_path = urlparse.urljoin(path, form["action"])
                else:
                    action_path = path
                for input_par in form.find_all('input'):
                    if not input_par.has_attr("name"):
                        continue
                    value = "foo"
                    if input_par.has_attr("value") and input_par["value"]:
                        value = input_par["value"]
                    pars[input_par["name"]] = value
                for input_par in form.find_all('select'):
                    pars[input_par["name"]] = "1"
                # store under 'href' so the link loop below picks the form target up
                if pars:
                    links.append({"href": action_path + '?' + urllib.urlencode(pars)})
                else:
                    self.report("form with no pars")
                    links.append({"href": action_path})
            links += self._emergency_parse(html_data, len(links))
        if self.verbose == 2:
            self.report(" "*(self._depth-depth) + path + " " + str(len(links)))
        elif self.verbose:
            sys.stdout.write(".")
            sys.stdout.flush()
        if not links:
            return
        if len(links) > self._max:
            links = links[:self._max]
        for a in links:
            try:
                href = str(a['href'].encode('utf-8'))
            except KeyError:
                # this link has no href
                continue
            except:
                # can't decode or something darker..
                continue
            if href.startswith("javascript") or href.startswith('mailto:'):
                continue
            href = urlparse.urljoin(path, href)
            if not href.startswith("http") or not "." in href:
                continue
            href = href.split('#', 1)[0]
            scheme_rpos = href.rfind('http://')
            if not scheme_rpos in [0, -1]:
                # looks like some kind of redirect so we try both too ;)
                href1 = href[scheme_rpos:]
                href2 = href[:scheme_rpos]
                self._check_url(basepath, path, href1, depth, width)
                self._check_url(basepath, path, href2, depth, width)
            self._check_url(basepath, path, href, depth, width)
        return self._found_args

    def _check_url(self, basepath, path, href, depth, width):
        """
        process the given url for a crawl
        check to see if we have to continue crawling on the given url.
        """
        do_crawling = self._parse_external or href.startswith(basepath)
        if do_crawling and not href in self._crawled:
            self._find_args(href)
            for reporter in self._parent._reporters:
                reporter.add_link(path, href)
            self.report("\n[Info] Spidering: " + str(href))
            if self._armed and depth > 0:
                if len(self._to_crawl) < self._max:
                    self._to_crawl.append([basepath, href, depth-1, width])
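
# Minimal usage sketch (not part of xsser itself): the stub parent below is
# hypothetical and only mirrors the attributes this module reads; it assumes
# the bundled threadpool module offers wait() to drain pending requests.
if __name__ == "__main__":
    class _StubParent(object):
        _landing = False      # crawler stops scheduling new work when True
        crawled_urls = []     # generate_result() appends "VECTOR" urls here
        _reporters = []       # objects notified about crawl progress

    crawler = Crawler(_StubParent())
    crawler.crawl("http://example.com/index.php?id=1", depth=2, width=50)
    crawler.pool.wait()       # assumption: the thread pool exposes wait()
    for attack_url in _StubParent.crawled_urls:
        print attack_url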