crawler.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: set expandtab tabstop=4 shiftwidth=4:
"""
This file is part of the XSSer project, https://xsser.03c8.net
Copyright (c) 2010/2021 | psy <epsylon@riseup.net>
xsser is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation version 3 of the License.
xsser is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.
You should have received a copy of the GNU General Public License along
with xsser; if not, write to the Free Software Foundation, Inc., 51
Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
import sys
import urllib.request, urllib.parse, urllib.error
import time
import traceback
from . import curlcontrol
from . import threadpool
from queue import Queue
from collections import defaultdict
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector

try:
    import pycurl
except:
    print("\n[Error] Cannot import lib: pycurl. \n\n To install it try:\n\n $ 'sudo apt-get install python3-pycurl' or 'pip3 install pycurl'\n")
    sys.exit()


class EmergencyLanding(Exception):
    pass

class Crawler(object):
    """
    Crawler class: walks a target site looking for links, forms and query
    parameters that can be turned into "XSS" attack candidate URLs.
    """
    def __init__(self, parent, curlwrapper=None, crawled=None, pool=None):
        # verbose: 0-no printing, 1-prints dots, 2-prints full output
        self.verbose = 0
        self._parent = parent
        self._to_crawl = []
        self._parse_external = True
        self._requests = []
        self._ownpool = False
        self._reporter = None
        self._armed = True
        self._poolsize = 10
        self._found_args = defaultdict(list)
        self.pool = pool
        if crawled:
            self._crawled = crawled
        else:
            self._crawled = []
        if curlwrapper:
            self.curl = curlwrapper
        else:
            self.curl = curlcontrol.Curl

    def report(self, msg):
        if self._reporter:
            self._reporter.report(msg)
        else:
            print(msg)

    def set_reporter(self, reporter):
        self._reporter = reporter
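
    # Parameter discovery: URLs that carry a query string get one attack
    # candidate per parameter (via generate_result); URLs without one get a
    # single "/XSS" path probe appended instead.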
    def _find_args(self, url):
        """
        find parameters in the given url.
        """
        parsed = urllib.parse.urlparse(url)
        if "C=" in parsed.query and "O=" in parsed.query:
            # ignore sorting parameters (directory-index style "C="/"O=" links)
            qs = ""
        else:
            qs = urllib.parse.parse_qs(parsed.query)
        if parsed.scheme:
            path = parsed.scheme + "://" + parsed.netloc + parsed.path
        else:
            path = parsed.netloc + parsed.path
        for arg_name in qs:
            key = (arg_name, parsed.netloc)
            zipped = list(zip(*self._found_args[key]))
            if not zipped or not path in zipped[0]:
                self._found_args[key].append([path, url])
                self.generate_result(arg_name, path, url)
        if not qs:
            parsed = urllib.parse.urlparse(url)
            if path.endswith("/"):
                attack_url = path + "XSS"
            else:
                attack_url = path + "/XSS"
            if not attack_url in self._parent.crawled_urls:
                self._parent.crawled_urls.append(attack_url)
        ncurrent = sum([len(s) for s in list(self._found_args.values())])
        if ncurrent >= self._max:
            self._armed = False

    def cancel(self):
        self._armed = False

    def crawl(self, path, depth=3, width=0, local_only=True):
        """
        setup and perform a crawl on the given url.
        """
        if not self._armed:
            return []
        parsed = urllib.parse.urlparse(path)
        basepath = parsed.scheme + "://" + parsed.netloc
        self._parse_external = not local_only
        if not self.pool:
            self.pool = threadpool.ThreadPool(self._poolsize)
        if self.verbose == 2:
            self.report("crawling: " + path)
        if width == 0:
            self._max = 1000000000
        else:
            self._max = int(width)
        self._path = path
        self._depth = depth
        attack_urls = []
        if not self._parent._landing and self._armed:
            self._crawl(basepath, path, depth, width)
        # now parse all found items
        if self._ownpool:
            self.pool.dismissWorkers(len(self.pool.workers))
            self.pool.joinAllDismissedWorkers()
        return attack_urls

    def shutdown(self):
        if self._ownpool:
            self.pool.dismissWorkers(len(self.pool.workers))
            self.pool.joinAllDismissedWorkers()
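
    # generate_result() builds an attack candidate by replacing one query
    # parameter value with the literal marker "XSS", e.g.
    #   http://host/page.php?id=1&cat=2  (arg_name="id")
    #   -> http://host/page.php?id=XSS&cat=2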
    def generate_result(self, arg_name, path, url):
        parsed = urllib.parse.urlparse(url)
        qs = urllib.parse.parse_qs(parsed.query)
        qs_joint = {}
        for key, val in qs.items():
            qs_joint[key] = val[0]
        attack_qs = dict(qs_joint)
        attack_qs[arg_name] = "XSS"
        attack_url = path + '?' + urllib.parse.urlencode(attack_qs)
        if not attack_url in self._parent.crawled_urls:
            self._parent.crawled_urls.append(attack_url)

    def _crawl(self, basepath, path, depth=3, width=0):
        """
        perform a crawl on the given url.
        this function downloads and looks for links.
        """
        self._crawled.append(path)
        if not path.startswith("http"):
            return
        def _cb(request, result):
            self._get_done(depth, width, request, result)
        self._requests.append(path)
        self.pool.addRequest(self._curl_main, [[path, depth, width, basepath]],
                             self._get_done_dummy, self._get_error)

    def _curl_main(self, pars):
        path, depth, width, basepath = pars
        if not self._armed or len(self._parent.crawled_urls) >= self._max:
            raise EmergencyLanding
        c = self.curl()
        c.set_timeout(5)
        try:
            res = c.get(path)
        except Exception as error:
            c.close()
            del c
            raise error
        c_info = c.info().get('content-type', None)
        c.close()
        del c
        self._get_done(basepath, depth, width, path, res, c_info)

    def _get_error(self, request, error):
        path, depth, width, basepath = request.args[0]
        e_type, e_value, e_tb = error
        if e_type == pycurl.error:
            errno, message = e_value.args
            if errno == 28:
                # pycurl errno 28: operation timed out
                print("requests pyerror -1")
                self.enqueue_jobs()
                self._requests.remove(path)
                return # timeout
            else:
                self.report('crawler curl error: '+message+' ('+str(errno)+')')
        elif e_type == EmergencyLanding:
            pass
        else:
            traceback.print_tb(e_tb)
            self.report('crawler error: '+str(e_value)+' '+path)
        if not e_type == EmergencyLanding:
            for reporter in self._parent._reporters:
                reporter.mosquito_crashed(path, str(e_value))
        self.enqueue_jobs()
        self._requests.remove(path)

    def _emergency_parse(self, html_data, start=0):
        # last-resort link extractor: scan the raw html for href= attributes
        # when BeautifulSoup cannot parse the page.
        links = set()
        pos = 0
        try:
            data_len = len(html_data)
        except:
            data_len = html_data
        try:
            while pos < data_len:
                if len(links)+start > self._max:
                    break
                pos = html_data.find("href=", pos)
                if not pos == -1:
                    sep = html_data[pos+5]
                    if sep == "h":
                        # unquoted href value; cut at the closing ">" instead
                        pos -= 1
                        sep = ">"
                    href = html_data[pos+6:html_data.find(sep, pos+7)].split("#")[0]
                    pos = pos+1
                    links.add(href)
                else:
                    break
        except:
            pass
        return [{'href': s} for s in links]

    def _get_done_dummy(self, request, result):
        # success callback for the thread pool; the result was already handled
        # synchronously inside _curl_main, so only bookkeeping remains.
        path = request.args[0][0]
        self.enqueue_jobs()
        self._requests.remove(path)

    def enqueue_jobs(self):
        if len(self.pool.workRequests) < int(self._max/2):
            while self._to_crawl:
                next_job = self._to_crawl.pop()
                self._crawl(*next_job)
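
    # _get_done() is invoked from _curl_main() once a page has been fetched:
    # it parses the page with BeautifulSoup (falling back to _emergency_parse
    # on failure), collects <a href> targets and <form> actions/inputs, and
    # feeds every candidate through _check_url() for argument discovery and
    # further crawling.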
    def _get_done(self, basepath, depth, width, path, html_data, content_type):
        if not self._armed or len(self._parent.crawled_urls) >= self._max:
            raise EmergencyLanding
        try:
            encoding = content_type.split(";")[1].split("=")[1].strip()
        except:
            encoding = None
        try:
            soup = BeautifulSoup(html_data, 'html.parser')
            links = None
        except:
            soup = None
            links = self._emergency_parse(html_data)
        for reporter in self._parent._reporters:
            reporter.start_crawl(path)
        if not links and soup:
            links = soup.findAll('a')
            forms = soup.findAll('form')
            for form in forms:
                pars = {}
                # use has_attr(): "x in tag" tests a Tag's children, not its attributes
                if form.has_attr("action"):
                    action_path = urllib.parse.urljoin(path, form["action"])
                else:
                    action_path = path
                for input_par in form.findAll('input'):
                    if not input_par.has_attr("name"):
                        continue
                    value = "foo"
                    if input_par.has_attr("value") and input_par["value"]:
                        value = input_par["value"]
                    pars[input_par["name"]] = value
                for input_par in form.findAll('select'):
                    if input_par.has_attr("name"):
                        pars[input_par["name"]] = "1"
                # store under 'href' so the loop below treats form targets like links
                if pars:
                    links.append({"href": action_path + '?' + urllib.parse.urlencode(pars)})
                else:
                    links.append({"href": action_path})
            links += self._emergency_parse(html_data, len(links))
        if self.verbose == 2:
            self.report(" "*(self._depth-depth) + path +" "+ str(len(links)))
        elif self.verbose:
            sys.stdout.write(".")
            sys.stdout.flush()
        if len(links) > self._max:
            links = links[:self._max]
        for a in links:
            try:
                href = str(a['href'])
            except KeyError:
                # this link has no href
                continue
            except:
                # can't decode or something darker..
                continue
            if href.startswith("javascript") or href.startswith('mailto:'):
                continue
            href = urllib.parse.urljoin(path, href)
            if not href.startswith("http") or not "." in href:
                continue
            href = href.split('#', 1)[0]
            scheme_rpos = href.rfind('http://')
            if not scheme_rpos in [0, -1]:
                # looks like some kind of redirect so we try both too ;)
                href1 = href[scheme_rpos:]
                href2 = href[:scheme_rpos]
                self._check_url(basepath, path, href1, depth, width)
                self._check_url(basepath, path, href2, depth, width)
            self._check_url(basepath, path, href, depth, width)
        return self._found_args

    def _check_url(self, basepath, path, href, depth, width):
        """
        process the given url for a crawl.
        check to see if we have to continue crawling on the given url.
        """
        do_crawling = self._parse_external or href.startswith(basepath)
        if do_crawling and not href in self._crawled:
            self._find_args(href)
            for reporter in self._parent._reporters:
                reporter.add_link(path, href)
            if self._armed and depth > 0:
                if len(self._to_crawl) < self._max:
                    self._to_crawl.append([basepath, href, depth-1, width])
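

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the upstream module): the smallest
# way to exercise the argument-discovery path without any network traffic.
# "DummyParent" and the target URL are placeholders; the only parent
# attributes the Crawler touches in this file are crawled_urls, _reporters
# and _landing, and self._max is normally set inside crawl(), so it is set
# by hand here. In xsser itself the core object provides all of this and
# drives the thread pool. Run as a module inside its package (the relative
# imports above prevent direct script execution).
if __name__ == "__main__":
    class DummyParent(object):
        def __init__(self):
            self.crawled_urls = []   # attack candidates collected by the crawler
            self._reporters = []     # reporters with start_crawl()/add_link()/...
            self._landing = False    # True would abort new crawl batches

    parent = DummyParent()
    crawler = Crawler(parent)
    crawler._max = 100               # normally set by crawl(); needed by _find_args()
    crawler._find_args("http://target.example/page.php?id=1&cat=2")
    for url in parent.crawled_urls:
        print(url)
    # expected output (one "XSS" marker per discovered parameter):
    #   http://target.example/page.php?id=XSS&cat=2
    #   http://target.example/page.php?id=1&cat=XSS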