# crawler.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: set expandtab tabstop=4 shiftwidth=4:
"""
This file is part of the XSSer project, https://xsser.03c8.net
Copyright (c) 2010/2020 | psy <epsylon@riseup.net>
xsser is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation version 3 of the License.
xsser is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.
You should have received a copy of the GNU General Public License along
with xsser; if not, write to the Free Software Foundation, Inc., 51
Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
  18. import sys
  19. import urllib.request, urllib.parse, urllib.error
  20. import time
  21. import traceback
  22. from . import curlcontrol
  23. from . import threadpool
  24. from queue import Queue
  25. from collections import defaultdict
  26. from bs4 import BeautifulSoup
  27. from bs4.dammit import EncodingDetector
  28. try:
  29. import pycurl
  30. except:
  31. print("\n[Error] Cannot import lib: pycurl. \n\n To install it try:\n\n $ 'sudo apt-get install python3-pycurl' or 'pip3 install pycurl'\n")
  32. sys.exit()
class EmergencyLanding(Exception):
    """
    Internal control-flow exception used to abort a crawl.

    Raised by worker code (e.g. when the crawler is disarmed or the
    maximum number of crawled URLs is reached) and treated as a
    non-error condition by the error callback.
    """
    pass
class Crawler(object):
    """
    Crawler class.

    Walks a site from a starting URL, discovering <a> links and <form>
    targets, and appends candidate attack URLs (query arguments replaced
    with "XSS") to self._parent.crawled_urls.  Downloads are dispatched
    through a threadpool; pages are parsed with BeautifulSoup, with a
    crude raw-HTML href scanner as fallback when parsing fails.
    """
    def __init__(self, parent, curlwrapper=None, crawled=None, pool=None):
        # verbose: 0-no printing, 1-prints dots, 2-prints full output
        self.verbose = 0
        self._parent = parent            # owner; must expose crawled_urls, _reporters, _landing
        self._to_crawl = []              # backlog of [basepath, href, depth, width] jobs
        self._parse_external = True      # follow off-host links (recomputed in crawl())
        self._requests = []              # paths currently queued/in flight in the pool
        self._ownpool = False            # NOTE(review): never set True anywhere, so the
                                         # dismissWorkers branches below never run — confirm
        self._reporter = None            # single reporter used by report(); see set_reporter()
        self._armed = False if False else True  # noqa placeholder? no — keep simple:
        self._armed = True               # False stops further crawling (cancel()/width limit)
        self._poolsize = 10              # worker count when we build our own pool
        self._found_args = defaultdict(list)  # (arg_name, netloc) -> [[path, url], ...]
        self.pool = pool                 # threadpool.ThreadPool; created lazily in crawl()
        if crawled:
            self._crawled = crawled      # (possibly shared) list of already-visited paths
        else:
            self._crawled = []
        if curlwrapper:
            self.curl = curlwrapper      # fetcher class (curlcontrol.Curl-compatible API)
        else:
            self.curl = curlcontrol.Curl

    def report(self, msg):
        # Route messages through the registered reporter, else stdout.
        if self._reporter:
            self._reporter.report(msg)
        else:
            print(msg)

    def set_reporter(self, reporter):
        self._reporter = reporter

    def _find_args(self, url):
        """
        find parameters in given url.

        Each query-string argument becomes an attack candidate via
        generate_result(); URLs with no query string get "XSS" appended
        as an extra path segment instead.  Disarms the crawler once the
        number of recorded argument/path pairs reaches self._max.
        """
        parsed = urllib.parse.urlparse(url)
        # "C=" plus "O=" is the signature of Apache directory-index sort
        # links; their arguments are not interesting, so skip them.
        if "C=" in parsed.query and "O=" in parsed.query:
            qs = ""
        else:
            qs = urllib.parse.parse_qs(parsed.query)
        if parsed.scheme:
            path = parsed.scheme + "://" + parsed.netloc + parsed.path
        else:
            path = parsed.netloc + parsed.path
        for arg_name in qs:
            key = (arg_name, parsed.netloc)
            zipped = list(zip(*self._found_args[key]))
            # only record the first occurrence of this argument on this path
            if not zipped or not path in zipped[0]:
                self._found_args[key].append([path, url])
                self.generate_result(arg_name, path, url)
        if not qs:
            parsed = urllib.parse.urlparse(url)
            if path.endswith("/"):
                attack_url = path + "XSS"
            else:
                attack_url = path + "/XSS"
            if not attack_url in self._parent.crawled_urls:
                self._parent.crawled_urls.append(attack_url)
        # disarm once the candidate count reaches the width limit
        # (self._max is set in crawl(); calling this first would raise)
        ncurrent = sum([len(s) for s in list(self._found_args.values())])
        if ncurrent >= self._max:
            self._armed = False

    def cancel(self):
        # Stop the crawl at the next check point (before each fetch/parse).
        self._armed = False

    def crawl(self, path, depth=3, width=0, local_only=True):
        """
        setup and perform a crawl on the given url.

        depth: link-recursion depth; width: 0 = unlimited, else caps
        self._max; local_only: restrict crawling to the starting host.
        Returns attack_urls, which is always [] here — results are
        delivered through self._parent.crawled_urls instead.
        """
        if not self._armed:
            return []
        parsed = urllib.parse.urlparse(path)
        basepath = parsed.scheme + "://" + parsed.netloc
        self._parse_external = not local_only
        if not self.pool:
            # NOTE(review): _ownpool is not set True here, so shutdown()
            # will never dismiss this self-created pool — confirm intended.
            self.pool = threadpool.ThreadPool(self._poolsize)
        if self.verbose == 2:
            self.report("crawling: " + path)
        if width == 0:
            self._max = 1000000000      # effectively unlimited
        else:
            self._max = int(width)
        self._path = path
        self._depth = depth
        attack_urls = []
        if not self._parent._landing and self._armed:
            self._crawl(basepath, path, depth, width)
            # now parse all found items
            if self._ownpool:
                self.pool.dismissWorkers(len(self.pool.workers))
                self.pool.joinAllDismissedWorkers()
        return attack_urls

    def shutdown(self):
        # Tear down the pool only if we own it (see _ownpool note above).
        if self._ownpool:
            self.pool.dismissWorkers(len(self.pool.workers))
            self.pool.joinAllDismissedWorkers()

    def generate_result(self, arg_name, path, url):
        """
        Build an attack URL from `url` with `arg_name` set to "XSS"
        (other arguments keep their first value) and record it on the
        parent if not already present.
        """
        parsed = urllib.parse.urlparse(url)
        qs = urllib.parse.parse_qs(parsed.query)
        qs_joint = {}
        for key, val in qs.items():
            qs_joint[key] = val[0]      # collapse multi-valued args to first value
        attack_qs = dict(qs_joint)
        attack_qs[arg_name] = "XSS"
        attack_url = path + '?' + urllib.parse.urlencode(attack_qs)
        if not attack_url in self._parent.crawled_urls:
            self._parent.crawled_urls.append(attack_url)

    def _crawl(self, basepath, path, depth=3, width=0):
        """
        perform a crawl on the given url.
        this function downloads and looks for links.
        """
        self._crawled.append(path)
        if not path.startswith("http"):
            return
        # NOTE(review): _cb is defined but never referenced — dead code.
        def _cb(request, result):
            self._get_done(depth, width, request, result)
        self._requests.append(path)
        # fetch asynchronously; _get_done_dummy / _get_error fire on completion
        self.pool.addRequest(self._curl_main, [[path, depth, width, basepath]],
                             self._get_done_dummy, self._get_error)

    def _curl_main(self, pars):
        # Worker entry point: download one page and hand it to _get_done().
        path, depth, width, basepath = pars
        if not self._armed or len(self._parent.crawled_urls) >= self._max:
            raise EmergencyLanding
        c = self.curl()
        c.set_timeout(5)
        try:
            res = c.get(path)
        except Exception as error:
            # always release the curl handle, then re-raise for _get_error
            c.close()
            del c
            raise error
        c_info = c.info().get('content-type', None)
        c.close()
        del c
        self._get_done(basepath, depth, width, path, res, c_info)

    def _get_error(self, request, error):
        # Pool error callback; `error` is an (exc_type, exc_value, tb) triple.
        path, depth, width, basepath = request.args[0]
        e_type, e_value, e_tb = error
        if e_type == pycurl.error:
            errno, message = e_value.args
            if errno == 28:             # curl errno 28: operation timed out
                print("requests pyerror -1")
                self.enqueue_jobs()
                self._requests.remove(path)
                return # timeout
            else:
                self.report('crawler curl error: '+message+' ('+str(errno)+')')
        elif e_type == EmergencyLanding:
            # deliberate abort (limit reached / cancelled) — not an error
            pass
        else:
            traceback.print_tb(e_tb)
            self.report('crawler error: '+str(e_value)+' '+path)
        if not e_type == EmergencyLanding:
            for reporter in self._parent._reporters:
                reporter.mosquito_crashed(path, str(e_value))
        self.enqueue_jobs()
        self._requests.remove(path)

    def _emergency_parse(self, html_data, start=0):
        """
        Minimal fallback link extractor: scan raw HTML for "href="
        substrings.  `start` offsets the link-count limit check.
        Returns BeautifulSoup-shaped dicts: [{'href': url}, ...].
        """
        links = set()
        pos = 0
        try:
            data_len = len(html_data)
        except:
            # html_data has no len(); the value itself bounds the scan
            data_len = html_data
        while pos < data_len:
            if len(links)+start > self._max:
                break
            pos = html_data.find("href=", pos)
            if not pos == -1:
                sep = html_data[pos+5]
                if sep == "h":          # unquoted href=http... : scan up to '>'
                    pos -= 1
                    sep=">"
                href = html_data[pos+6:html_data.find(sep, pos+7)].split("#")[0]
                pos = pos+1
                links.add(href)
            else:
                break
        return [{'href': s} for s in links]

    def _get_done_dummy(self, request, result):
        # Pool success callback; real work already happened in _curl_main.
        path = request.args[0][0]
        self.enqueue_jobs()
        self._requests.remove(path)

    def enqueue_jobs(self):
        # Refill the pool from the backlog whenever it runs low.
        if len(self.pool.workRequests) < int(self._max/2):
            while self._to_crawl:
                next_job = self._to_crawl.pop()
                self._crawl(*next_job)

    def _get_done(self, basepath, depth, width, path, html_data, content_type):
        """
        Parse one downloaded page: collect <a> links and <form> targets,
        then feed every viable absolute URL to _check_url().
        Returns self._found_args (callers here ignore the return value).
        """
        if not self._armed or len(self._parent.crawled_urls) >= self._max:
            raise EmergencyLanding
        try:
            # charset from "Content-Type: text/html; charset=XXX"
            # NOTE(review): computed but never used below — confirm.
            encoding = content_type.split(";")[1].split("=")[1].strip()
        except:
            encoding = None
        try:
            soup = BeautifulSoup(html_data, 'html.parser')
            links = None
        except:
            # unparseable page: fall back to the raw href scanner
            soup = None
            links = self._emergency_parse(html_data)
        for reporter in self._parent._reporters:
            reporter.start_crawl(path)
        if not links and soup:
            links = soup.findAll('a')
            forms = soup.findAll('form')
            for form in forms:
                pars = {}
                # NOTE(review): `in` on a bs4 Tag tests child elements, not
                # attributes — form.has_attr("action") may be intended here
                # (same for the "name"/"value" checks below).
                if "action" in form:
                    action_path = urllib.parse.urljoin(path, form["action"])
                else:
                    action_path = path
                for input_par in form.findAll('input'):
                    if "name" not in input_par:
                        continue
                    value = "foo"       # filler for inputs with no usable default
                    if "value" in input_par and input_par["value"]:
                        value = input_par["value"]
                    pars[input_par["name"]] = value
                for input_par in form.findAll('select'):
                    pars[input_par["name"]] = "1"
                # NOTE(review): form entries are stored under key "url" but
                # the loop below reads a['href'] and skips on KeyError, so
                # these never reach _check_url — confirm intended.
                if pars:
                    links.append({"url":action_path + '?' + urllib.parse.urlencode(pars)})
                else:
                    self.report("form with no pars")
                    links.append({"url":action_path})
            links += self._emergency_parse(html_data, len(links))
        if self.verbose == 2:
            self.report(" "*(self._depth-depth) + path +" "+ str(len(links)))
        elif self.verbose:
            sys.stdout.write(".")
            sys.stdout.flush()
        if len(links) > self._max:
            links = links[:self._max]
        for a in links:
            try:
                #href = str(a['href'].encode('utf-8'))
                href = str(a['href'])
            except KeyError:
                # this link has no href
                continue
            except:
                # can't decode or something darker..
                continue
            if href.startswith("javascript") or href.startswith('mailto:'):
                continue
            href = urllib.parse.urljoin(path, href)
            if not href.startswith("http") or not "." in href:
                continue
            href = href.split('#',1)[0]     # drop the fragment
            scheme_rpos = href.rfind('http://')
            if not scheme_rpos in [0, -1]:
                # looks like some kind of redirect so we try both too ;)
                href1 = href[scheme_rpos:]
                href2 = href[:scheme_rpos]
                self._check_url(basepath, path, href1, depth, width)
                self._check_url(basepath, path, href2, depth, width)
            self._check_url(basepath, path, href, depth, width)
        return self._found_args

    def _check_url(self, basepath, path, href, depth, width):
        """
        process the given url for a crawl
        check to see if we have to continue crawling on the given url.
        """
        # off-host links only when _parse_external (crawl(local_only=False))
        do_crawling = self._parse_external or href.startswith(basepath)
        if do_crawling and not href in self._crawled:
            self._find_args(href)
            for reporter in self._parent._reporters:
                reporter.add_link(path, href)
            if self._armed and depth>0:
                if len(self._to_crawl) < self._max:
                    self._to_crawl.append([basepath, href, depth-1, width])