crawler.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: set expandtab tabstop=4 shiftwidth=4:
"""
This file is part of the XSSer project, https://xsser.03c8.net

Copyright (c) 2010/2019 | psy <epsylon@riseup.net>

xsser is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation version 3 of the License.

xsser is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along
with xsser; if not, write to the Free Software Foundation, Inc., 51
Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
import sys
import urllib
import urllib2
import urlparse
import pycurl
import time
import traceback
import curlcontrol
import threadpool
from Queue import Queue
from collections import defaultdict
from BeautifulSoup import BeautifulSoup


class EmergencyLanding(Exception):
    """
    Raised internally to abort in-flight requests once the crawler is
    disarmed or the configured limit of crawled urls has been reached.
    """
    pass

class Crawler(object):
    """
    Crawler class.
    """
    def __init__(self, parent, curlwrapper=None, crawled=None, pool=None):
        # verbose: 0-no printing, 1-prints dots, 2-prints full output
        self.verbose = 0
        # the parent is expected to expose _landing, _reporters and
        # crawled_urls; those are the only parent attributes used here
        self._parent = parent
        self._to_crawl = []
        self._parse_external = True
        self._requests = []
        self._ownpool = False
        self._reporter = None
        self._armed = True
        self._poolsize = 10
        self._found_args = defaultdict(list)
        self.pool = pool
        if crawled:
            self._crawled = crawled
        else:
            self._crawled = []
        if curlwrapper:
            self.curl = curlwrapper
        else:
            self.curl = curlcontrol.Curl
    def report(self, msg):
        if self._reporter:
            self._reporter.report(msg)
        else:
            print msg

    def set_reporter(self, reporter):
        self._reporter = reporter
    def _find_args(self, url):
        """
        find parameters in given url.
        """
        parsed = urllib2.urlparse.urlparse(url)
        if "C=" in parsed.query and "O=" in parsed.query:
            # looks like an Apache directory-index sort link; ignore its query
            qs = ""
        else:
            qs = urlparse.parse_qs(parsed.query)
        if parsed.scheme:
            path = parsed.scheme + "://" + parsed.netloc + parsed.path
        else:
            path = parsed.netloc + parsed.path
        for arg_name in qs:
            key = (arg_name, parsed.netloc)
            zipped = zip(*self._found_args[key])
            if not zipped or not path in zipped[0]:
                self._found_args[key].append([path, url])
                self.generate_result(arg_name, path, url)
        if not qs:
            parsed = urllib2.urlparse.urlparse(url)
            if path.endswith("/"):
                attack_url = path + "XSS"
            else:
                attack_url = path + "/XSS"
            if not attack_url in self._parent.crawled_urls:
                self._parent.crawled_urls.append(attack_url)
        ncurrent = sum(map(lambda s: len(s), self._found_args.values()))
        if ncurrent >= self._max:
            self._armed = False
    def cancel(self):
        self._armed = False
    def crawl(self, path, depth=3, width=0, local_only=True):
        """
        setup and perform a crawl on the given url.
        """
        if not self._armed:
            return []
        parsed = urllib2.urlparse.urlparse(path)
        basepath = parsed.scheme + "://" + parsed.netloc
        self._parse_external = not local_only
        if not self.pool:
            self.pool = threadpool.ThreadPool(self._poolsize)
        if self.verbose == 2:
            self.report("crawling: " + path)
        if width == 0:
            self._max = 1000000000
        else:
            self._max = int(width)
        self._path = path
        self._depth = depth
        attack_urls = []
        if not self._parent._landing and self._armed:
            self._crawl(basepath, path, depth, width)
        # now parse all found items
        if self._ownpool:
            self.pool.dismissWorkers(len(self.pool.workers))
            self.pool.joinAllDismissedWorkers()
        return attack_urls
    def shutdown(self):
        if self._ownpool:
            self.pool.dismissWorkers(len(self.pool.workers))
            self.pool.joinAllDismissedWorkers()
    def generate_result(self, arg_name, path, url):
        parsed = urllib2.urlparse.urlparse(url)
        qs = urlparse.parse_qs(parsed.query)
        qs_joint = {}
        for key, val in qs.iteritems():
            qs_joint[key] = val[0]
        attack_qs = dict(qs_joint)
        # replace the value of the found parameter with the "XSS" probe
        attack_qs[arg_name] = "XSS"
        attack_url = path + '?' + urllib.urlencode(attack_qs)
        if not attack_url in self._parent.crawled_urls:
            self._parent.crawled_urls.append(attack_url)
    def _crawl(self, basepath, path, depth=3, width=0):
        """
        perform a crawl on the given url.
        this function downloads and looks for links.
        """
        self._crawled.append(path)
        if not path.startswith("http"):
            return

        def _cb(request, result):
            # note: this local callback is never wired up; results are
            # handled by _get_done_dummy / _get_error below
            self._get_done(depth, width, request, result)

        self._requests.append(path)
        self.pool.addRequest(self._curl_main, [[path, depth, width, basepath]],
                             self._get_done_dummy, self._get_error)
    def _curl_main(self, pars):
        path, depth, width, basepath = pars
        if not self._armed or len(self._parent.crawled_urls) >= self._max:
            raise EmergencyLanding
        c = self.curl()
        c.set_timeout(5)
        try:
            res = c.get(path)
        except Exception as error:
            c.close()
            del c
            raise error
        c_info = c.info().get('content-type', None)
        c.close()
        del c
        self._get_done(basepath, depth, width, path, res, c_info)
    def _get_error(self, request, error):
        path, depth, width, basepath = request.args[0]
        e_type, e_value, e_tb = error
        if e_type == pycurl.error:
            errno, message = e_value.args
            if errno == 28:
                print("requests pyerror -1")
                self.enqueue_jobs()
                self._requests.remove(path)
                return  # timeout
            else:
                self.report('crawler curl error: '+message+' ('+str(errno)+')')
        elif e_type == EmergencyLanding:
            pass
        else:
            traceback.print_tb(e_tb)
            self.report('crawler error: '+str(e_value)+' '+path)
        if not e_type == EmergencyLanding:
            for reporter in self._parent._reporters:
                reporter.mosquito_crashed(path, str(e_value))
        self.enqueue_jobs()
        self._requests.remove(path)
    def _emergency_parse(self, html_data, start=0):
        # naive fallback link extractor: scan the raw html for href=
        # attributes when BeautifulSoup cannot parse the page
        links = set()
        pos = 0
        try:
            data_len = len(html_data)
        except:
            data_len = html_data
        while pos < data_len:
            if len(links)+start > self._max:
                break
            pos = html_data.find("href=", pos)
            if not pos == -1:
                sep = html_data[pos+5]
                if sep == "h":
                    # unquoted href (href=http...); cut at the closing '>'
                    pos -= 1
                    sep = ">"
                href = html_data[pos+6:html_data.find(sep, pos+7)].split("#")[0]
                pos = pos+1
                links.add(href)
            else:
                break
        return map(lambda s: {'href': s}, links)
    def _get_done_dummy(self, request, result):
        path = request.args[0][0]
        self.enqueue_jobs()
        self._requests.remove(path)
    def enqueue_jobs(self):
        if len(self.pool.workRequests) < int(self._max/2):
            while self._to_crawl:
                next_job = self._to_crawl.pop()
                self._crawl(*next_job)
    def _get_done(self, basepath, depth, width, path, html_data, content_type):
        if not self._armed or len(self._parent.crawled_urls) >= self._max:
            raise EmergencyLanding
        try:
            encoding = content_type.split(";")[1].split("=")[1].strip()
        except:
            encoding = None
        try:
            soup = BeautifulSoup(html_data, fromEncoding=encoding)
            links = None
        except:
            soup = None
            links = self._emergency_parse(html_data)
        for reporter in self._parent._reporters:
            reporter.start_crawl(path)
        if not links and soup:
            links = soup.findAll('a')
            forms = soup.findAll('form')
            for form in forms:
                pars = {}
                if form.has_key("action"):
                    action_path = urlparse.urljoin(path, form["action"])
                else:
                    action_path = path
                for input_par in form.findAll('input'):
                    if not input_par.has_key("name"):
                        continue
                    value = "foo"
                    if input_par.has_key("value") and input_par["value"]:
                        value = input_par["value"]
                    pars[input_par["name"]] = value
                for input_par in form.findAll('select'):
                    pars[input_par["name"]] = "1"
                if pars:
                    links.append({"url": action_path + '?' + urllib.urlencode(pars)})
                else:
                    self.report("form with no pars")
                    links.append({"url": action_path})
            links += self._emergency_parse(html_data, len(links))
        if self.verbose == 2:
            self.report(" "*(self._depth-depth) + path + " " + str(len(links)))
        elif self.verbose:
            sys.stdout.write(".")
            sys.stdout.flush()
        if len(links) > self._max:
            links = links[:self._max]
        for a in links:
            try:
                href = str(a['href'].encode('utf-8'))
            except KeyError:
                # this link has no href
                continue
            except:
                # can't decode or something darker..
                continue
            if href.startswith("javascript") or href.startswith('mailto:'):
                continue
            href = urlparse.urljoin(path, href)
            if not href.startswith("http") or not "." in href:
                continue
            href = href.split('#', 1)[0]
            scheme_rpos = href.rfind('http://')
            if not scheme_rpos in [0, -1]:
                # looks like some kind of redirect so we try both too ;)
                href1 = href[scheme_rpos:]
                href2 = href[:scheme_rpos]
                self._check_url(basepath, path, href1, depth, width)
                self._check_url(basepath, path, href2, depth, width)
            self._check_url(basepath, path, href, depth, width)
        return self._found_args
    def _check_url(self, basepath, path, href, depth, width):
        """
        process the given url for a crawl
        check to see if we have to continue crawling on the given url.
        """
        do_crawling = self._parse_external or href.startswith(basepath)
        if do_crawling and not href in self._crawled:
            self._find_args(href)
            for reporter in self._parent._reporters:
                reporter.add_link(path, href)
            if self._armed and depth > 0:
                if len(self._to_crawl) < self._max:
                    self._to_crawl.append([basepath, href, depth-1, width])
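
# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the upstream module: it reproduces, in
# isolation, the query-string mutation that _find_args()/generate_result()
# apply to every discovered parameter. The target url below is made up, and
# the snippet needs only urlparse/urllib, so it runs without the rest of the
# XSSer framework (no thread pool, no parent object).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example = "http://target.example/page.php?id=2&cat=books"
    parsed = urlparse.urlparse(example)
    base = parsed.scheme + "://" + parsed.netloc + parsed.path
    qs = urlparse.parse_qs(parsed.query)
    for arg_name in qs:
        # keep the first value of every parameter, then overwrite one of them
        # with the "XSS" probe, as generate_result() does above
        attack_qs = dict((key, val[0]) for key, val in qs.iteritems())
        attack_qs[arg_name] = "XSS"
        print base + "?" + urllib.urlencode(attack_qs)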