added mwmbl scraper
This commit is contained in:
		
							parent
							
								
									8944ca6894
								
							
						
					
					
						commit
						a20d4de1e4
					
				| @ -104,6 +104,7 @@ class config{ | |||||||
| 	const PROXY_PINTEREST = false; | 	const PROXY_PINTEREST = false; | ||||||
| 	const PROXY_SEZNAM = false; | 	const PROXY_SEZNAM = false; | ||||||
| 	const PROXY_NAVER = false; | 	const PROXY_NAVER = false; | ||||||
|  | 	const PROXY_MWMBL = false; | ||||||
| 	const PROXY_FTM = false; // findthatmeme
 | 	const PROXY_FTM = false; // findthatmeme
 | ||||||
| 	const PROXY_IMGUR = false; | 	const PROXY_IMGUR = false; | ||||||
| 	const PROXY_YANDEX_W = false; // yandex web
 | 	const PROXY_YANDEX_W = false; // yandex web
 | ||||||
|  | |||||||
| @ -12,3 +12,5 @@ This guide assumes that there is already a configured webserver sitting on port | |||||||
| 5. Restart the tor service using `service tor restart` | 5. Restart the tor service using `service tor restart` | ||||||
| 6. Wait for a while... | 6. Wait for a while... | ||||||
| 7. Run `cat /var/lib/tor/4get/hostname`. That is your onion address! | 7. Run `cat /var/lib/tor/4get/hostname`. That is your onion address! | ||||||
|  | 
 | ||||||
|  | # Specify your own tor address | ||||||
|  | |||||||
| @ -902,6 +902,7 @@ class frontend{ | |||||||
| 						"yandex" => "Yandex", | 						"yandex" => "Yandex", | ||||||
| 						"google" => "Google", | 						"google" => "Google", | ||||||
| 						"yep" => "Yep", | 						"yep" => "Yep", | ||||||
|  | 						"mwmbl" => "Mwmbl", | ||||||
| 						"mojeek" => "Mojeek", | 						"mojeek" => "Mojeek", | ||||||
| 						"marginalia" => "Marginalia", | 						"marginalia" => "Marginalia", | ||||||
| 						"wiby" => "wiby", | 						"wiby" => "wiby", | ||||||
| @ -1018,6 +1019,11 @@ class frontend{ | |||||||
| 				$lib = new facebook(); | 				$lib = new facebook(); | ||||||
| 				break;*/ | 				break;*/ | ||||||
| 			 | 			 | ||||||
|  | 			case "mwmbl": | ||||||
|  | 				include "scraper/mwmbl.php"; | ||||||
|  | 				$lib = new mwmbl(); | ||||||
|  | 				break; | ||||||
|  | 				 | ||||||
| 			case "mojeek": | 			case "mojeek": | ||||||
| 				include "scraper/mojeek.php"; | 				include "scraper/mojeek.php"; | ||||||
| 				$lib = new mojeek(); | 				$lib = new mojeek(); | ||||||
|  | |||||||
| @ -538,8 +538,7 @@ class google{ | |||||||
| 			$url .= "?" . $get; | 			$url .= "?" . $get; | ||||||
| 		} | 		} | ||||||
| 		 | 		 | ||||||
| 		//curl_setopt($curlproc, CURLOPT_URL, $url);
 | 		curl_setopt($curlproc, CURLOPT_URL, $url); | ||||||
| 		curl_setopt($curlproc, CURLOPT_URL, "https://ifconfig.co"); |  | ||||||
| 		 | 		 | ||||||
| 		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
 | 		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
 | ||||||
| 		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); | 		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); | ||||||
| @ -560,7 +559,6 @@ class google{ | |||||||
| 		} | 		} | ||||||
| 		 | 		 | ||||||
| 		curl_close($curlproc); | 		curl_close($curlproc); | ||||||
| 		echo $data; |  | ||||||
| 		return $data; | 		return $data; | ||||||
| 	} | 	} | ||||||
| 	 | 	 | ||||||
|  | |||||||
							
								
								
									
										168
									
								
								scraper/mwmbl.php
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										168
									
								
								scraper/mwmbl.php
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,168 @@ | |||||||
|  | <?php | ||||||
|  | 
 | ||||||
/**
 * Scraper for the Mwmbl search engine (https://mwmbl.org).
 *
 * Fetches the HTML result page for a query and converts every result item
 * into the associative-array layout returned by web().
 *
 * Relies on three names defined outside this file: the `backend` and
 * `fuckhtml` helper classes (loaded from lib/) and `config::USER_AGENT`.
 */
class mwmbl{
	
	// Declared explicitly: creating these on the fly inside the constructor
	// is a dynamic-property write, which PHP 8.2 deprecates.
	public $backend;
	public $fuckhtml;
	
	/**
	 * Loads the helper libraries and instantiates them.
	 */
	public function __construct(){
		
		// include_once so that building a second scraper instance in the
		// same request does not try to redeclare the helper classes.
		include_once "lib/backend.php";
		$this->backend = new backend("mwmbl");
		
		include_once "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}
	
	/**
	 * Returns the search filters available for a page type.
	 *
	 * @param mixed $page Page type requested by the caller (unused here).
	 * @return array Always empty: this scraper exposes no filters.
	 */
	public function getfilters($page){
		
		return [];
	}
	
	/**
	 * Performs an HTTP GET request and returns the response body.
	 *
	 * @param mixed  $proxy Proxy descriptor, handed untouched to
	 *                      backend::assign_proxy().
	 * @param string $url   Target URL without a query string.
	 * @param array  $get   Query parameters appended to $url when non-empty.
	 * @return string|bool  Whatever curl_exec() returned.
	 * @throws Exception    With the cURL error message when the transfer fails.
	 */
	private function get($proxy, $url, $get = []){
		
		$curlproc = curl_init();
		
		try{
			if($get !== []){
				$url .= "?" . http_build_query($get);
			}
			
			curl_setopt($curlproc, CURLOPT_URL, $url);
			
			curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
			curl_setopt($curlproc, CURLOPT_HTTPHEADER,
				["User-Agent: " . config::USER_AGENT,
				"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
				"Accept-Language: en-US,en;q=0.5",
				"Accept-Encoding: gzip",
				"DNT: 1",
				"Connection: keep-alive",
				"Upgrade-Insecure-Requests: 1",
				"Sec-Fetch-Dest: document",
				"Sec-Fetch-Mode: navigate",
				"Sec-Fetch-Site: none",
				"Sec-Fetch-User: ?1"]
			);
			
			curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
			curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
			curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
			curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
			curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
			
			$this->backend->assign_proxy($curlproc, $proxy);
			
			$data = curl_exec($curlproc);
			
			if(curl_errno($curlproc)){
				
				// The message is read before the finally block closes the handle.
				throw new Exception(curl_error($curlproc));
			}
			
			return $data;
			
		}finally{
			
			// Release the handle on every path, including the error path
			// and a failure inside assign_proxy().
			curl_close($curlproc);
		}
	}
	
	/**
	 * Runs a web search on Mwmbl.
	 *
	 * @param array $get Request parameters; "s" holds the search term.
	 * @return array Result set with the keys status, spelling, npt, answer,
	 *               web, image, video, news and related. Only "web" is
	 *               populated; "npt" stays null because no next page is
	 *               requested.
	 * @throws Exception When the search term is empty or the page cannot
	 *                   be fetched.
	 */
	public function web($get){
		
		// A missing "s" key is handled like an empty search term.
		$search = isset($get["s"]) ? (string)$get["s"] : "";
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		try{
			$html = $this->get(
				$this->backend->get_ip(), // no next page!
				"https://mwmbl.org/app/home/",
				[
					"q" => $search
				]
			);
		}catch(Exception $error){
			
			// Same message as before; the cause is chained for debugging.
			throw new Exception("Failed to fetch HTML", 0, $error);
		}
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		$this->fuckhtml->load($html);
		
		// Every search hit is an <li class="result"> element.
		$results =
			$this->fuckhtml
			->getElementsByClassName(
				"result",
				"li"
			);
		
		foreach($results as $result){
			
			// Scope the following lookups to this result only.
			$this->fuckhtml->load($result);
			
			$p =
				$this->fuckhtml
				->getElementsByTagName("p");
			
			$titles =
				$this->fuckhtml
				->getElementsByClassName("title", $p);
			
			$extracts =
				$this->fuckhtml
				->getElementsByClassName("extract", $p);
			
			$links =
				$this->fuckhtml
				->getElementsByTagName("a");
			
			// A hit without a title or a link target cannot be rendered;
			// skip it instead of indexing into an empty list.
			if(
				!isset($titles[0]) ||
				!isset($links[0]["attributes"]["href"])
			){
				
				continue;
			}
			
			// A missing extract yields an empty description so the field
			// stays a string.
			$description = "";
			if(isset($extracts[0])){
				
				$description =
					$this->titledots(
						$this->fuckhtml
						->getTextContent($extracts[0])
					);
			}
			
			$out["web"][] = [
				"title" =>
					$this->titledots(
						$this->fuckhtml
						->getTextContent($titles[0])
					),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$links[0]["attributes"]["href"]
					),
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $out;
	}
	
	/**
	 * Strips trailing "…" (U+2026) characters from a string.
	 *
	 * rtrim() is deliberately avoided: it treats its second argument as a
	 * set of single bytes, so rtrim($title, "…") would also remove a lone
	 * 0xE2, 0x80 or 0xA6 byte and cut multibyte characters such as "Ц"
	 * (0xD0 0xA6) in half.
	 *
	 * @param string $title Text to clean up.
	 * @return string Text without trailing ellipsis characters.
	 */
	private function titledots($title){
		
		$title = (string)$title;
		$ellipsis = "…";
		$length = strlen($ellipsis);
		
		// Remove whole ellipsis sequences only, one at a time.
		while(
			strlen($title) >= $length &&
			substr($title, -$length) === $ellipsis
		){
			
			$title = substr($title, 0, strlen($title) - $length);
		}
		
		return $title;
	}
}
| @ -125,6 +125,10 @@ $settings = [ | |||||||
| 						"value" => "yep", | 						"value" => "yep", | ||||||
| 						"text" => "Yep" | 						"text" => "Yep" | ||||||
| 					], | 					], | ||||||
|  | 					[ | ||||||
|  | 						"value" => "mwmbl", | ||||||
|  | 						"text" => "Mwmbl" | ||||||
|  | 					], | ||||||
| 					[ | 					[ | ||||||
| 						"value" => "mojeek", | 						"value" => "mojeek", | ||||||
| 						"text" => "Mojeek" | 						"text" => "Mojeek" | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 lolcat
						lolcat