crawler.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # vim: set expandtab tabstop=4 shiftwidth=4:
  4. """
  5. This file is part of the XSSer project, https://xsser.03c8.net
  6. Copyright (c) 2010/2019 | psy <epsylon@riseup.net>
  7. xsser is free software; you can redistribute it and/or modify it under
  8. the terms of the GNU General Public License as published by the Free
  9. Software Foundation version 3 of the License.
  10. xsser is distributed in the hope that it will be useful, but WITHOUT ANY
  11. WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  12. FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  13. details.
  14. You should have received a copy of the GNU General Public License along
  15. with xsser; if not, write to the Free Software Foundation, Inc., 51
  16. Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  17. """
  18. import sys
  19. import urllib.request, urllib.parse, urllib.error
  20. import pycurl
  21. import time
  22. import traceback
  23. from . import curlcontrol
  24. from . import threadpool
  25. from queue import Queue
  26. from collections import defaultdict
  27. from bs4 import BeautifulSoup
  28. from bs4.dammit import EncodingDetector
  29. class EmergencyLanding(Exception):
  30. pass
  31. class Crawler(object):
  32. """
  33. Crawler class.
  34. """
  35. def __init__(self, parent, curlwrapper=None, crawled=None, pool=None):
  36. # verbose: 0-no printing, 1-prints dots, 2-prints full output
  37. self.verbose = 0
  38. self._parent = parent
  39. self._to_crawl = []
  40. self._parse_external = True
  41. self._requests = []
  42. self._ownpool = False
  43. self._reporter = None
  44. self._armed = True
  45. self._poolsize = 10
  46. self._found_args = defaultdict(list)
  47. self.pool = pool
  48. if crawled:
  49. self._crawled = crawled
  50. else:
  51. self._crawled = []
  52. if curlwrapper:
  53. self.curl = curlwrapper
  54. else:
  55. self.curl = curlcontrol.Curl
  56. def report(self, msg):
  57. if self._reporter:
  58. self._reporter.report(msg)
  59. else:
  60. print(msg)
  61. def set_reporter(self, reporter):
  62. self._reporter = reporter
  63. def _find_args(self, url):
  64. """
  65. find parameters in given url.
  66. """
  67. parsed = urllib.parse.urlparse(url)
  68. if "C=" in parsed.query and "O=" in parsed.query:
  69. qs = ""
  70. else:
  71. qs = urllib.parse.parse_qs(parsed.query)
  72. if parsed.scheme:
  73. path = parsed.scheme + "://" + parsed.netloc + parsed.path
  74. else:
  75. path = parsed.netloc + parsed.path
  76. for arg_name in qs:
  77. key = (arg_name, parsed.netloc)
  78. zipped = list(zip(*self._found_args[key]))
  79. if not zipped or not path in zipped[0]:
  80. self._found_args[key].append([path, url])
  81. self.generate_result(arg_name, path, url)
  82. if not qs:
  83. parsed = urllib.parse.urlparse(url)
  84. if path.endswith("/"):
  85. attack_url = path + "XSS"
  86. else:
  87. attack_url = path + "/XSS"
  88. if not attack_url in self._parent.crawled_urls:
  89. self._parent.crawled_urls.append(attack_url)
  90. ncurrent = sum([len(s) for s in list(self._found_args.values())])
  91. if ncurrent >= self._max:
  92. self._armed = False
  93. def cancel(self):
  94. self._armed = False
  95. def crawl(self, path, depth=3, width=0, local_only=True):
  96. """
  97. setup and perform a crawl on the given url.
  98. """
  99. if not self._armed:
  100. return []
  101. parsed = urllib.parse.urlparse(path)
  102. basepath = parsed.scheme + "://" + parsed.netloc
  103. self._parse_external = not local_only
  104. if not self.pool:
  105. self.pool = threadpool.ThreadPool(self._poolsize)
  106. if self.verbose == 2:
  107. self.report("crawling: " + path)
  108. if width == 0:
  109. self._max = 1000000000
  110. else:
  111. self._max = int(width)
  112. self._path = path
  113. self._depth = depth
  114. attack_urls = []
  115. if not self._parent._landing and self._armed:
  116. self._crawl(basepath, path, depth, width)
  117. # now parse all found items
  118. if self._ownpool:
  119. self.pool.dismissWorkers(len(self.pool.workers))
  120. self.pool.joinAllDismissedWorkers()
  121. return attack_urls
  122. def shutdown(self):
  123. if self._ownpool:
  124. self.pool.dismissWorkers(len(self.pool.workers))
  125. self.pool.joinAllDismissedWorkers()
  126. def generate_result(self, arg_name, path, url):
  127. parsed = urllib.parse.urlparse(url)
  128. qs = urllib.parse.parse_qs(parsed.query)
  129. qs_joint = {}
  130. for key, val in qs.items():
  131. qs_joint[key] = val[0]
  132. attack_qs = dict(qs_joint)
  133. attack_qs[arg_name] = "XSS"
  134. attack_url = path + '?' + urllib.parse.urlencode(attack_qs)
  135. if not attack_url in self._parent.crawled_urls:
  136. self._parent.crawled_urls.append(attack_url)
  137. def _crawl(self, basepath, path, depth=3, width=0):
  138. """
  139. perform a crawl on the given url.
  140. this function downloads and looks for links.
  141. """
  142. self._crawled.append(path)
  143. if not path.startswith("http"):
  144. return
  145. def _cb(request, result):
  146. self._get_done(depth, width, request, result)
  147. self._requests.append(path)
  148. self.pool.addRequest(self._curl_main, [[path, depth, width, basepath]],
  149. self._get_done_dummy, self._get_error)
  150. def _curl_main(self, pars):
  151. path, depth, width, basepath = pars
  152. if not self._armed or len(self._parent.crawled_urls) >= self._max:
  153. raise EmergencyLanding
  154. c = self.curl()
  155. c.set_timeout(5)
  156. try:
  157. res = c.get(path)
  158. except Exception as error:
  159. c.close()
  160. del c
  161. raise error
  162. c_info = c.info().get('content-type', None)
  163. c.close()
  164. del c
  165. self._get_done(basepath, depth, width, path, res, c_info)
  166. def _get_error(self, request, error):
  167. path, depth, width, basepath = request.args[0]
  168. e_type, e_value, e_tb = error
  169. if e_type == pycurl.error:
  170. errno, message = e_value.args
  171. if errno == 28:
  172. print("requests pyerror -1")
  173. self.enqueue_jobs()
  174. self._requests.remove(path)
  175. return # timeout
  176. else:
  177. self.report('crawler curl error: '+message+' ('+str(errno)+')')
  178. elif e_type == EmergencyLanding:
  179. pass
  180. else:
  181. traceback.print_tb(e_tb)
  182. self.report('crawler error: '+str(e_value)+' '+path)
  183. if not e_type == EmergencyLanding:
  184. for reporter in self._parent._reporters:
  185. reporter.mosquito_crashed(path, str(e_value))
  186. self.enqueue_jobs()
  187. self._requests.remove(path)
  188. def _emergency_parse(self, html_data, start=0):
  189. links = set()
  190. pos = 0
  191. try:
  192. data_len = len(html_data)
  193. except:
  194. data_len = html_data
  195. while pos < data_len:
  196. if len(links)+start > self._max:
  197. break
  198. pos = html_data.find("href=", pos)
  199. if not pos == -1:
  200. sep = html_data[pos+5]
  201. if sep == "h":
  202. pos -= 1
  203. sep=">"
  204. href = html_data[pos+6:html_data.find(sep, pos+7)].split("#")[0]
  205. pos = pos+1
  206. links.add(href)
  207. else:
  208. break
  209. return [{'href': s} for s in links]
  210. def _get_done_dummy(self, request, result):
  211. path = request.args[0][0]
  212. self.enqueue_jobs()
  213. self._requests.remove(path)
  214. def enqueue_jobs(self):
  215. if len(self.pool.workRequests) < int(self._max/2):
  216. while self._to_crawl:
  217. next_job = self._to_crawl.pop()
  218. self._crawl(*next_job)
  219. def _get_done(self, basepath, depth, width, path, html_data, content_type):
  220. if not self._armed or len(self._parent.crawled_urls) >= self._max:
  221. raise EmergencyLanding
  222. try:
  223. encoding = content_type.split(";")[1].split("=")[1].strip()
  224. except:
  225. encoding = None
  226. try:
  227. soup = BeautifulSoup(html_data, 'html.parser')
  228. links = None
  229. except:
  230. soup = None
  231. links = self._emergency_parse(html_data)
  232. for reporter in self._parent._reporters:
  233. reporter.start_crawl(path)
  234. if not links and soup:
  235. links = soup.findAll('a')
  236. forms = soup.findAll('form')
  237. for form in forms:
  238. pars = {}
  239. if "action" in form:
  240. action_path = urllib.parse.urljoin(path, form["action"])
  241. else:
  242. action_path = path
  243. for input_par in form.findAll('input'):
  244. if "name" not in input_par:
  245. continue
  246. value = "foo"
  247. if "value" in input_par and input_par["value"]:
  248. value = input_par["value"]
  249. pars[input_par["name"]] = value
  250. for input_par in form.findAll('select'):
  251. pars[input_par["name"]] = "1"
  252. if pars:
  253. links.append({"url":action_path + '?' + urllib.parse.urlencode(pars)})
  254. else:
  255. self.report("form with no pars")
  256. links.append({"url":action_path})
  257. links += self._emergency_parse(html_data, len(links))
  258. if self.verbose == 2:
  259. self.report(" "*(self._depth-depth) + path +" "+ str(len(links)))
  260. elif self.verbose:
  261. sys.stdout.write(".")
  262. sys.stdout.flush()
  263. if len(links) > self._max:
  264. links = links[:self._max]
  265. for a in links:
  266. try:
  267. #href = str(a['href'].encode('utf-8'))
  268. href = str(a['href'])
  269. except KeyError:
  270. # this link has no href
  271. continue
  272. except:
  273. # can't decode or something darker..
  274. continue
  275. if href.startswith("javascript") or href.startswith('mailto:'):
  276. continue
  277. href = urllib.parse.urljoin(path, href)
  278. if not href.startswith("http") or not "." in href:
  279. continue
  280. href = href.split('#',1)[0]
  281. scheme_rpos = href.rfind('http://')
  282. if not scheme_rpos in [0, -1]:
  283. # looks like some kind of redirect so we try both too ;)
  284. href1 = href[scheme_rpos:]
  285. href2 = href[:scheme_rpos]
  286. self._check_url(basepath, path, href1, depth, width)
  287. self._check_url(basepath, path, href2, depth, width)
  288. self._check_url(basepath, path, href, depth, width)
  289. return self._found_args
  290. def _check_url(self, basepath, path, href, depth, width):
  291. """
  292. process the given url for a crawl
  293. check to see if we have to continue crawling on the given url.
  294. """
  295. do_crawling = self._parse_external or href.startswith(basepath)
  296. if do_crawling and not href in self._crawled:
  297. self._find_args(href)
  298. for reporter in self._parent._reporters:
  299. reporter.add_link(path, href)
  300. if self._armed and depth>0:
  301. if len(self._to_crawl) < self._max:
  302. self._to_crawl.append([basepath, href, depth-1, width])