fag protection
This commit is contained in:
		
							parent
							
								
									81502d4721
								
							
						
					
					
						commit
						2976c0a6a4
					
				| @ -23,17 +23,13 @@ class config{ | |||||||
| 	// Enable the API?
 | 	// Enable the API?
 | ||||||
| 	const API_ENABLED = true; | 	const API_ENABLED = true; | ||||||
| 	 | 	 | ||||||
| 	// Bot protection
 | 	//
 | ||||||
| 	// 4get.ca has been hit with 500k bot reqs every single day for months
 | 	// BOT PROTECTION
 | ||||||
| 	// you probably want to enable this if your instance is public...
 | 	//
 | ||||||
| 	// 0 = disabled
 |  | ||||||
| 	// 1 = ask for image captcha (requires imagemagick v6 or higher)
 |  | ||||||
| 	// @TODO: 2 = invite only (users need a pass)
 |  | ||||||
| 	const BOT_PROTECTION = 0; |  | ||||||
| 	 | 	 | ||||||
| 	// Maximal number of searches per captcha key/pass issued. Counter gets
 | 	// 0 = disabled, 1 = ask for image captcha, @TODO: 2 = invite only (users need a pass)
 | ||||||
| 	// reset on every APCU cache clear (should happen once a day)
 | 	// VERY useful against a targeted attack
 | ||||||
| 	const MAX_SEARCHES = 100; | 	const BOT_PROTECTION = 0; | ||||||
| 	 | 	 | ||||||
| 	// if BOT_PROTECTION is set to 1, specify the available datasets here
 | 	// if BOT_PROTECTION is set to 1, specify the available datasets here
 | ||||||
| 	// images should be named from 1.png to X.png, and be 100x100 in size
 | 	// images should be named from 1.png to X.png, and be 100x100 in size
 | ||||||
| @ -45,6 +41,32 @@ class config{ | |||||||
| 		// ["minecraft", 848]
 | 		// ["minecraft", 848]
 | ||||||
| 	]; | 	]; | ||||||
| 	 | 	 | ||||||
|  | 	// If this regular expression matches the user agent, the request is blocked
 | ||||||
|  | 	// Not useful at all against a targeted attack
 | ||||||
|  | 	const HEADER_REGEX = '/bot|wget|curl|python-requests|scrapy|go-http-client|ruby|yahoo|spider/i'; | ||||||
|  | 	 | ||||||
|  | 	// Block clients who present any of the following headers in their request (SPECIFY IN !!lowercase!!)
 | ||||||
|  | 	// Eg: ["x-forwarded-for", "x-via", "forwarded-for", "via"];
 | ||||||
|  | 	// Useful for blocking *some* proxies used for botting
 | ||||||
|  | 	const FILTERED_HEADER_KEYS = [ | ||||||
|  | 		"x-forwarded-for", | ||||||
|  | 		"x-via", | ||||||
|  | 		"forwarded-for", | ||||||
|  | 		"via" | ||||||
|  | 	]; | ||||||
|  | 	 | ||||||
|  | 	// @TODO: Portscan the user for open proxies before allowing a connection, block user if any are found
 | ||||||
|  | 	// Requires the nmap package
 | ||||||
|  | 	const NMAP_PROXY_CHECK = false; | ||||||
|  | 	 | ||||||
|  | 	// @TODO: Make IP blacklist public under /api/v1/blacklist endpoint ?
 | ||||||
|  | 	const PUBLIC_IP_BLACKLIST = true; | ||||||
|  | 	 | ||||||
|  | 	// Maximal number of searches per captcha key/pass issued. Counter gets
 | ||||||
|  | 	// reset on every APCU cache clear (should happen once a day).
 | ||||||
|  | 	// Only useful when BOT_PROTECTION is NOT set to 0
 | ||||||
|  | 	const MAX_SEARCHES = 100; | ||||||
|  | 	 | ||||||
| 	// List of domains that point to your servers. Include your tor/i2p
 | 	// List of domains that point to your servers. Include your tor/i2p
 | ||||||
| 	// addresses here! Must be a valid URL. Won't affect links placed on
 | 	// addresses here! Must be a valid URL. Won't affect links placed on
 | ||||||
| 	// the homepage.
 | 	// the homepage.
 | ||||||
|  | |||||||
| @ -8,6 +8,9 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel | |||||||
| 3. The captcha imagesets are located in `data/captcha/your_image_set/*.png` | 3. The captcha imagesets are located in `data/captcha/your_image_set/*.png` | ||||||
| 4. The captcha font is located in `data/fonts/captcha.ttf` | 4. The captcha font is located in `data/fonts/captcha.ttf` | ||||||
| 
 | 
 | ||||||
|  | ## Robots.txt | ||||||
|  | Make sure you configure this right to optimize your search engine presence! Head over to `/robots.txt` and change the 4get.ca domain to your own domain. | ||||||
|  | 
 | ||||||
| ## Server listing | ## Server listing | ||||||
| To be listed on https://4get.ca/instances , you must contact *any* of the people in the server list and ask them to add you to their list of instances in their configuration. The instance list is distributed, and I don't have control over it. | To be listed on https://4get.ca/instances , you must contact *any* of the people in the server list and ask them to add you to their list of instances in their configuration. The instance list is distributed, and I don't have control over it. | ||||||
| 
 | 
 | ||||||
| @ -32,4 +35,4 @@ If you see spammy entries in your instances list, simply remove the instance fro | |||||||
| Done! The scraper you chose should now be using the rotating proxies. When asking for the next page of results, it will use the same proxy to avoid detection! | Done! The scraper you chose should now be using the rotating proxies. When asking for the next page of results, it will use the same proxy to avoid detection! | ||||||
| 
 | 
 | ||||||
| ### Important! | ### Important! | ||||||
| If you ever test out a `socks5` proxy locally on your machine and find out it works but doesn't on your server, try supplying the `socks5_hostname` protocol instead. | If you ever test out a `socks5` proxy locally on your machine and find out it works but doesn't on your server, try supplying the `socks5_hostname` protocol instead. Hopefully this tip can save you 3 hours of your life! | ||||||
|  | |||||||
| @ -29,7 +29,7 @@ try{ | |||||||
| 	 | 	 | ||||||
| }catch(Exception $error){ | }catch(Exception $error){ | ||||||
| 	 | 	 | ||||||
| 	$frontend->drawscrapererror($error->getMessage(), $get, "images"); | 	$frontend->drawscrapererror($error->getMessage(), $get, "images", $payload["timetaken"]); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| if(count($results["image"]) === 0){ | if(count($results["image"]) === 0){ | ||||||
|  | |||||||
| @ -32,6 +32,8 @@ class backend{ | |||||||
| 		 | 		 | ||||||
| 		$proxylist = array_values($proxylist); | 		$proxylist = array_values($proxylist); | ||||||
| 		 | 		 | ||||||
|  | 		echo $proxy_index_raw % count($proxylist); | ||||||
|  | 		 | ||||||
| 		return $proxylist[$proxy_index_raw % count($proxylist)]; | 		return $proxylist[$proxy_index_raw % count($proxylist)]; | ||||||
| 	} | 	} | ||||||
| 	 | 	 | ||||||
|  | |||||||
| @ -44,7 +44,7 @@ class frontend{ | |||||||
| 			$replacements["timetaken"] !== null | 			$replacements["timetaken"] !== null | ||||||
| 		){ | 		){ | ||||||
| 			 | 			 | ||||||
| 			$replacements["timetaken"] = '<div class="timetaken">Took ' . substr(microtime(true) - $replacements["timetaken"], 0, 4) . 's</div>'; | 			$replacements["timetaken"] = '<div class="timetaken">Took ' . number_format(microtime(true) - $replacements["timetaken"], 2) . 's</div>'; | ||||||
| 		} | 		} | ||||||
| 		 | 		 | ||||||
| 		$handle = fopen("template/{$template}", "r"); | 		$handle = fopen("template/{$template}", "r"); | ||||||
| @ -84,29 +84,54 @@ class frontend{ | |||||||
| 				"filters" => $this->generatehtmlfilters($filters, $get) | 				"filters" => $this->generatehtmlfilters($filters, $get) | ||||||
| 			]); | 			]); | ||||||
| 		 | 		 | ||||||
|  | 		$headers_raw = getallheaders(); | ||||||
|  | 		$header_keys = []; | ||||||
|  | 		$user_agent = ""; | ||||||
|  | 		$bad_header = false; | ||||||
|  | 		 | ||||||
|  | 		foreach($headers_raw as $headerkey => $headervalue){ | ||||||
|  | 			 | ||||||
|  | 			$headerkey = strtolower($headerkey); | ||||||
|  | 			if($headerkey == "user-agent"){ | ||||||
|  | 				 | ||||||
|  | 				$user_agent = $headervalue; | ||||||
|  | 				continue; | ||||||
|  | 			} | ||||||
|  | 			 | ||||||
|  | 			// check header key
 | ||||||
|  | 			if(in_array($headerkey, config::FILTERED_HEADER_KEYS)){ | ||||||
|  | 				 | ||||||
|  | 				$bad_header = true; | ||||||
|  | 				break; | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 		 | ||||||
| 		if( | 		if( | ||||||
| 			preg_match( | 			preg_match( | ||||||
| 				'/bot|wget|curl|python-requests|scrapy|feedfetcher|go-http-client|ruby|universalfeedparser|yahoo\! slurp|spider|rss/i', | 				config::HEADER_REGEX, | ||||||
| 				$_SERVER["HTTP_USER_AGENT"] | 				$user_agent | ||||||
| 			) | 			) || | ||||||
|  | 			$bad_header === true | ||||||
| 		){ | 		){ | ||||||
| 			 | 			 | ||||||
| 			// bot detected !!
 | 			// bot detected !!
 | ||||||
| 			apcu_inc("captcha_gen"); | 			apcu_inc("captcha_gen"); | ||||||
| 			 | 			 | ||||||
|  | 			$null = null; | ||||||
| 			$this->drawerror( | 			$this->drawerror( | ||||||
| 				"Tshh, blocked!", | 				"Tshh, blocked!", | ||||||
| 				'You were blocked from viewing this page. If you wish to scrape data from 4get, please consider running <a href="https://git.lolcat.ca/lolcat/4get" rel="noreferrer nofollow">your own 4get instance</a>.', | 				'Your browser, IP or IP range has been blocked from this 4get instance. If this is an error, please <a href="/about">contact the administrator</a>.', | ||||||
|  | 				microtime(true) | ||||||
| 			); | 			); | ||||||
| 			die(); | 			die(); | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
| 	 | 	 | ||||||
| 	public function drawerror($title, $error){ | 	public function drawerror($title, $error, $timetaken){ | ||||||
| 		 | 		 | ||||||
| 		echo | 		echo | ||||||
| 			$this->load("search.html", [ | 			$this->load("search.html", [ | ||||||
| 				"timetaken" => null, | 				"timetaken" => $timetaken, | ||||||
| 				"class" => "", | 				"class" => "", | ||||||
| 				"right-left" => "", | 				"right-left" => "", | ||||||
| 				"right-right" => "", | 				"right-right" => "", | ||||||
| @ -119,7 +144,7 @@ class frontend{ | |||||||
| 		die(); | 		die(); | ||||||
| 	} | 	} | ||||||
| 	 | 	 | ||||||
| 	public function drawscrapererror($error, $get, $target){ | 	public function drawscrapererror($error, $get, $target, $timetaken){ | ||||||
| 		 | 		 | ||||||
| 		$this->drawerror( | 		$this->drawerror( | ||||||
| 			"Shit", | 			"Shit", | ||||||
| @ -131,7 +156,8 @@ class frontend{ | |||||||
| 				'<li>Remove keywords that could cause errors</li>' . | 				'<li>Remove keywords that could cause errors</li>' . | ||||||
| 				'<li><a href="/instances?target=' . $target . "&" . $this->buildquery($get, false) . '">Try your search on another 4get instance</a></li>' . | 				'<li><a href="/instances?target=' . $target . "&" . $this->buildquery($get, false) . '">Try your search on another 4get instance</a></li>' . | ||||||
| 			'</ul><br>' . | 			'</ul><br>' . | ||||||
| 			'If the error persists, please <a href="/about">contact the administrator</a>.' | 			'If the error persists, please <a href="/about">contact the administrator</a>.', | ||||||
|  | 			$timetaken | ||||||
| 		); | 		); | ||||||
| 	} | 	} | ||||||
| 	 | 	 | ||||||
| @ -483,10 +509,6 @@ class frontend{ | |||||||
| 						$archives[] = "warosu.org"; | 						$archives[] = "warosu.org"; | ||||||
| 						break; | 						break; | ||||||
| 					 | 					 | ||||||
| 					case "cm": |  | ||||||
| 						$archives[] = "boards.fireden.net"; |  | ||||||
| 						break; |  | ||||||
| 					 |  | ||||||
| 					case "f": | 					case "f": | ||||||
| 						$archives[] = "archive.4plebs.org"; | 						$archives[] = "archive.4plebs.org"; | ||||||
| 						break; | 						break; | ||||||
| @ -503,12 +525,10 @@ class frontend{ | |||||||
| 						break; | 						break; | ||||||
| 					 | 					 | ||||||
| 					case "v": | 					case "v": | ||||||
| 						$archives[] = "boards.fireden.net"; |  | ||||||
| 						$archives[] = "arch.b4k.co"; | 						$archives[] = "arch.b4k.co"; | ||||||
| 						break; | 						break; | ||||||
| 					 | 					 | ||||||
| 					case "vg": | 					case "vg": | ||||||
| 						$archives[] = "boards.fireden.net"; |  | ||||||
| 						$archives[] = "arch.b4k.co"; | 						$archives[] = "arch.b4k.co"; | ||||||
| 						break; | 						break; | ||||||
| 					 | 					 | ||||||
| @ -579,7 +599,6 @@ class frontend{ | |||||||
| 						break; | 						break; | ||||||
| 					 | 					 | ||||||
| 					case "sci": | 					case "sci": | ||||||
| 						$archives[] = "boards.fireden.net"; |  | ||||||
| 						$archives[] = "warosu.org"; | 						$archives[] = "warosu.org"; | ||||||
| 						$archives[] = "eientei.xyz"; | 						$archives[] = "eientei.xyz"; | ||||||
| 						break; | 						break; | ||||||
| @ -614,7 +633,6 @@ class frontend{ | |||||||
| 						break; | 						break; | ||||||
| 					 | 					 | ||||||
| 					case "ic": | 					case "ic": | ||||||
| 						$archives[] = "boards.fireden.net"; |  | ||||||
| 						$archives[] = "warosu.org"; | 						$archives[] = "warosu.org"; | ||||||
| 						break; | 						break; | ||||||
| 					 | 					 | ||||||
| @ -741,10 +759,6 @@ class frontend{ | |||||||
| 						$archives[] = "desuarchive.org"; | 						$archives[] = "desuarchive.org"; | ||||||
| 						break; | 						break; | ||||||
| 					 | 					 | ||||||
| 					case "y": |  | ||||||
| 						$archives[] = "boards.fireden.net"; |  | ||||||
| 						break; |  | ||||||
| 					 |  | ||||||
| 					case "t": | 					case "t": | ||||||
| 						$archives[] = "archiveofsins.com"; | 						$archives[] = "archiveofsins.com"; | ||||||
| 						break; | 						break; | ||||||
| @ -802,7 +816,7 @@ class frontend{ | |||||||
| 		$payload .= | 		$payload .= | ||||||
| 				'<a href="https://webcache.googleusercontent.com/search?q=cache:' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://google.com" alt="go">Google cache</a>' . | 				'<a href="https://webcache.googleusercontent.com/search?q=cache:' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://google.com" alt="go">Google cache</a>' . | ||||||
| 				'<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' . | 				'<a href="https://web.archive.org/web/' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.org" alt="ar">Archive.org</a>' . | ||||||
| 				'<a href="https://archive.is/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' . | 				'<a href="https://archive.ph/newest/' . htmlspecialchars($link) . '" class="list" target="_BLANK"><img src="/favicon?s=https://archive.is" alt="ar">Archive.is</a>' . | ||||||
| 				'<a href="https://ghostarchive.org/search?term=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://ghostarchive.org" alt="gh">Ghostarchive</a>' . | 				'<a href="https://ghostarchive.org/search?term=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://ghostarchive.org" alt="gh">Ghostarchive</a>' . | ||||||
| 				'<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' . | 				'<a href="https://www.bing.com/search?q=url%3A' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://bing.com" alt="bi">Bing cache</a>' . | ||||||
| 				'<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' . | 				'<a href="https://megalodon.jp/?url=' . $urlencode . '" class="list" target="_BLANK"><img src="/favicon?s=https://megalodon.jp" alt="me">Megalodon</a>' . | ||||||
|  | |||||||
| @ -73,7 +73,7 @@ class fuckhtml{ | |||||||
| 			$attributes = []; | 			$attributes = []; | ||||||
| 
 | 
 | ||||||
| 			preg_match_all( | 			preg_match_all( | ||||||
| 				'/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/', | 				'/([^\/\s\\=]+)(?:\s*=\s*("[^"]*"|\'[^\']*\'|[^\s]*))?/i', | ||||||
| 				$starting_tags[2][$i][0], | 				$starting_tags[2][$i][0], | ||||||
| 				$regex_attributes | 				$regex_attributes | ||||||
| 			); | 			); | ||||||
| @ -88,7 +88,7 @@ class fuckhtml{ | |||||||
| 					continue; | 					continue; | ||||||
| 				} | 				} | ||||||
| 				 | 				 | ||||||
| 				$attributes[$regex_attributes[1][$k]] = | 				$attributes[strtolower($regex_attributes[1][$k])] = | ||||||
| 					trim($regex_attributes[2][$k], "'\" \n\r\t\v\x00"); | 					trim($regex_attributes[2][$k], "'\" \n\r\t\v\x00"); | ||||||
| 			} | 			} | ||||||
| 			 | 			 | ||||||
|  | |||||||
| @ -31,7 +31,7 @@ try{ | |||||||
| 	 | 	 | ||||||
| }catch(Exception $error){ | }catch(Exception $error){ | ||||||
| 	 | 	 | ||||||
| 	$frontend->drawscrapererror($error->getMessage(), $get, "music"); | 	$frontend->drawscrapererror($error->getMessage(), $get, "music", $payload["timetaken"]); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| $categories = [ | $categories = [ | ||||||
|  | |||||||
							
								
								
									
										2
									
								
								news.php
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								news.php
									
									
									
									
									
								
							| @ -31,7 +31,7 @@ try{ | |||||||
| 	 | 	 | ||||||
| }catch(Exception $error){ | }catch(Exception $error){ | ||||||
| 	 | 	 | ||||||
| 	$frontend->drawscrapererror($error->getMessage(), $get, "news"); | 	$frontend->drawscrapererror($error->getMessage(), $get, "news", $payload["timetaken"]); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* | /* | ||||||
|  | |||||||
| @ -654,6 +654,7 @@ class google{ | |||||||
| 				 | 				 | ||||||
| 				throw new Exception("Failed to get HTML"); | 				throw new Exception("Failed to get HTML"); | ||||||
| 			} | 			} | ||||||
|  | 			 | ||||||
| 			//$html = file_get_contents("scraper/google.html");
 | 			//$html = file_get_contents("scraper/google.html");
 | ||||||
| 		} | 		} | ||||||
| 		 | 		 | ||||||
|  | |||||||
| @ -31,7 +31,7 @@ try{ | |||||||
| 	 | 	 | ||||||
| }catch(Exception $error){ | }catch(Exception $error){ | ||||||
| 	 | 	 | ||||||
| 	$frontend->drawscrapererror($error->getMessage(), $get, "videos"); | 	$frontend->drawscrapererror($error->getMessage(), $get, "videos", $payload["timetaken"]); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| $categories = [ | $categories = [ | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 lolcat
						lolcat