added mwmbl scraper
This commit is contained in:
		
							parent
							
								
									8944ca6894
								
							
						
					
					
						commit
						a20d4de1e4
					
				| @ -104,6 +104,7 @@ class config{ | ||||
| 	const PROXY_PINTEREST = false; | ||||
| 	const PROXY_SEZNAM = false; | ||||
| 	const PROXY_NAVER = false; | ||||
| 	const PROXY_MWMBL = false; | ||||
| 	const PROXY_FTM = false; // findthatmeme
 | ||||
| 	const PROXY_IMGUR = false; | ||||
| 	const PROXY_YANDEX_W = false; // yandex web
 | ||||
|  | ||||
| @ -12,3 +12,5 @@ This guide assumes that there is already a configured webserver sitting on port | ||||
| 5. Restart the tor service using `service tor restart` | ||||
| 6. Wait for a while... | ||||
| 7. Run `cat /var/lib/tor/4get/hostname`. That is your onion address! | ||||
| 
 | ||||
| # Specify your own tor address | ||||
|  | ||||
| @ -902,6 +902,7 @@ class frontend{ | ||||
| 						"yandex" => "Yandex", | ||||
| 						"google" => "Google", | ||||
| 						"yep" => "Yep", | ||||
| 						"mwmbl" => "Mwmbl", | ||||
| 						"mojeek" => "Mojeek", | ||||
| 						"marginalia" => "Marginalia", | ||||
| 						"wiby" => "wiby", | ||||
| @ -1018,6 +1019,11 @@ class frontend{ | ||||
| 				$lib = new facebook(); | ||||
| 				break;*/ | ||||
| 			 | ||||
| 			case "mwmbl": | ||||
| 				include "scraper/mwmbl.php"; | ||||
| 				$lib = new mwmbl(); | ||||
| 				break; | ||||
| 				 | ||||
| 			case "mojeek": | ||||
| 				include "scraper/mojeek.php"; | ||||
| 				$lib = new mojeek(); | ||||
|  | ||||
| @ -538,8 +538,7 @@ class google{ | ||||
| 			$url .= "?" . $get; | ||||
| 		} | ||||
| 		 | ||||
| 		//curl_setopt($curlproc, CURLOPT_URL, $url);
 | ||||
| 		curl_setopt($curlproc, CURLOPT_URL, "https://ifconfig.co"); | ||||
| 		curl_setopt($curlproc, CURLOPT_URL, $url); | ||||
| 		 | ||||
| 		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
 | ||||
| 		curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers); | ||||
| @ -560,7 +559,6 @@ class google{ | ||||
| 		} | ||||
| 		 | ||||
| 		curl_close($curlproc); | ||||
| 		echo $data; | ||||
| 		return $data; | ||||
| 	} | ||||
| 	 | ||||
|  | ||||
							
								
								
									
										168
									
								
								scraper/mwmbl.php
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										168
									
								
								scraper/mwmbl.php
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,168 @@ | ||||
| <?php | ||||
| 
 | ||||
/**
 * Scraper for the Mwmbl search engine (https://mwmbl.org).
 *
 * Fetches the HTML results page and converts every result list item into
 * an entry of the "web" section of the returned array.
 */
class mwmbl{
	
	// Declared explicitly: creating properties dynamically is deprecated
	// since PHP 8.2. Kept public so visibility matches the former dynamic
	// properties.
	public $backend;
	public $fuckhtml;
	
	/**
	 * Loads the helper libraries and instantiates them.
	 */
	public function __construct(){
		
		// include_once avoids a fatal "cannot redeclare class" error when
		// the library was already loaded elsewhere in the request.
		include_once "lib/backend.php";
		$this->backend = new backend("mwmbl");
		
		include_once "lib/fuckhtml.php";
		$this->fuckhtml = new fuckhtml();
	}
	
	/**
	 * Returns the filters available for a page type.
	 *
	 * @param mixed $page Page type; unused, this scraper exposes no filters.
	 * @return array Always an empty array.
	 */
	public function getfilters($page){
		
		return [];
	}
	
	/**
	 * Performs an HTTP GET request through cURL.
	 *
	 * @param mixed  $proxy Proxy descriptor handed to backend::assign_proxy().
	 * @param string $url   Target URL without query string.
	 * @param array  $get   Query parameters; appended to $url when non-empty.
	 * @return string|bool  Whatever curl_exec() returned.
	 * @throws Exception    With the cURL error message when the transfer fails.
	 */
	private function get($proxy, $url, $get = []){
		
		$curlproc = curl_init();
		
		if($get !== []){
			$get = http_build_query($get);
			$url .= "?" . $get;
		}
		
		curl_setopt($curlproc, CURLOPT_URL, $url);
		
		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
		curl_setopt($curlproc, CURLOPT_HTTPHEADER,
			["User-Agent: " . config::USER_AGENT,
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
			"Accept-Language: en-US,en;q=0.5",
			"Accept-Encoding: gzip",
			"DNT: 1",
			"Connection: keep-alive",
			"Upgrade-Insecure-Requests: 1",
			"Sec-Fetch-Dest: document",
			"Sec-Fetch-Mode: navigate",
			"Sec-Fetch-Site: none",
			"Sec-Fetch-User: ?1"]
		);
		
		curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
		
		$this->backend->assign_proxy($curlproc, $proxy);
		
		$data = curl_exec($curlproc);
		
		if(curl_errno($curlproc)){
			
			// Read the message before closing so the handle is released
			// on the failure path as well.
			$message = curl_error($curlproc);
			curl_close($curlproc);
			
			throw new Exception($message);
		}
		
		curl_close($curlproc);
		return $data;
	}
	
	/**
	 * Runs a web search.
	 *
	 * @param array $get Request parameters; "s" holds the search term.
	 * @return array Result set with the keys status, spelling, npt, answer,
	 *               web, image, video, news and related. Only "web" is
	 *               populated by this scraper.
	 * @throws Exception When the search term is empty or the page cannot
	 *                   be fetched.
	 */
	public function web($get){
		
		$search = $get["s"];
		if(strlen($search) === 0){
			
			throw new Exception("Search term is empty!");
		}
		
		try{
			$html = $this->get(
				$this->backend->get_ip(), // no next page!
				"https://mwmbl.org/app/home/",
				[
					"q" => $search
				]
			);
		}catch(Exception $error){
			
			// Same message as before; the cause is chained for debugging.
			throw new Exception("Failed to fetch HTML", 0, $error);
		}
		
		$out = [
			"status" => "ok",
			"spelling" => [
				"type" => "no_correction",
				"using" => null,
				"correction" => null
			],
			"npt" => null,
			"answer" => [],
			"web" => [],
			"image" => [],
			"video" => [],
			"news" => [],
			"related" => []
		];
		
		$this->fuckhtml->load($html);
		
		$results =
			$this->fuckhtml
			->getElementsByClassName(
				"result",
				"li"
			);
		
		foreach($results as $result){
			
			$this->fuckhtml->load($result);
			
			$p =
				$this->fuckhtml
				->getElementsByTagName("p");
			
			$titles =
				$this->fuckhtml
				->getElementsByClassName(
					"title",
					$p
				);
			
			$extracts =
				$this->fuckhtml
				->getElementsByClassName(
					"extract",
					$p
				);
			
			$links =
				$this->fuckhtml
				->getElementsByTagName("a");
			
			// A result without a title or a link target cannot be shown;
			// skip it instead of indexing into an empty lookup.
			if(
				!isset($titles[0]) ||
				!isset($links[0]["attributes"]["href"])
			){
				
				continue;
			}
			
			// NOTE(review): a null description is assumed to be acceptable
			// to the consumer of this array -- confirm against the frontend.
			$description = null;
			if(isset($extracts[0])){
				
				$description =
					$this->titledots(
						$this->fuckhtml
						->getTextContent($extracts[0])
					);
			}
			
			$out["web"][] = [
				"title" =>
					$this->titledots(
						$this->fuckhtml
						->getTextContent($titles[0])
					),
				"description" => $description,
				"url" =>
					$this->fuckhtml
					->getTextContent(
						$links[0]["attributes"]["href"]
					),
				"date" => null,
				"type" => "web",
				"thumb" => [
					"url" => null,
					"ratio" => null
				],
				"sublink" => [],
				"table" => []
			];
		}
		
		return $out;
	}
	
	/**
	 * Strips trailing ellipsis characters ("…") from a string.
	 *
	 * rtrim() cannot be used here: it treats its second argument as a list
	 * of single bytes, so it would also strip the last byte of other
	 * multibyte characters (e.g. "æ" or "Ц") and leave invalid UTF-8.
	 *
	 * @param mixed $title Text to clean; cast to string.
	 * @return string The text without trailing ellipsis characters.
	 */
	private function titledots($title){
		
		$title = (string)$title;
		$ellipsis = "…";
		$length = strlen($ellipsis);
		
		// Compare whole byte sequences so only complete ellipsis
		// characters are removed.
		while(substr($title, -$length) === $ellipsis){
			
			$title = (string)substr($title, 0, -$length);
		}
		
		return $title;
	}
}
| @ -125,6 +125,10 @@ $settings = [ | ||||
| 						"value" => "yep", | ||||
| 						"text" => "Yep" | ||||
| 					], | ||||
| 					[ | ||||
| 						"value" => "mwmbl", | ||||
| 						"text" => "Mwmbl" | ||||
| 					], | ||||
| 					[ | ||||
| 						"value" => "mojeek", | ||||
| 						"text" => "Mojeek" | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 lolcat
						lolcat