fixed mwmbl, results are slightly better but wtf did they do to the sublinks my gawd
This commit is contained in:
		
							parent
							
								
									36993013e5
								
							
						
					
					
						commit
						fbac3eeb8d
					
				| @ -8,10 +8,10 @@ Welcome! This guide assumes that you have a working 4get instance. This will hel | |||||||
| 3. The captcha imagesets are located in `data/captcha/your_image_set/*.png` | 3. The captcha imagesets are located in `data/captcha/your_image_set/*.png` | ||||||
| 4. The captcha font is located in `data/fonts/captcha.ttf` | 4. The captcha font is located in `data/fonts/captcha.ttf` | ||||||
| 
 | 
 | ||||||
| # Cloudflare bypass | # Cloudflare bypass (TLS check) | ||||||
| **Note: this only allows you to bypass the browser integrity checks. Captchas & JavaScript challenges will not be bypassed.** | **Note: this only allows you to bypass the browser integrity checks. Captchas & JavaScript challenges will not be bypassed.** | ||||||
| 
 | 
 | ||||||
| Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** search engine. Following these instructions might make your package manager unhappy. | Configuring this lets you fetch images sitting behind Cloudflare and allows you to scrape the **Yep** and **Mwmbl** search engines. Please be aware that APT will fight you on this: it will keep reinstalling the OpenSSL version of curl whenever you update. | ||||||
| 
 | 
 | ||||||
| First, follow these instructions. Only install the Firefox modules: | First, follow these instructions. Only install the Firefox modules: | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -27,18 +27,24 @@ class mwmbl{ | |||||||
| 		 | 		 | ||||||
| 		curl_setopt($curlproc, CURLOPT_URL, $url); | 		curl_setopt($curlproc, CURLOPT_URL, $url); | ||||||
| 		 | 		 | ||||||
|  | 		// use http2
 | ||||||
|  | 		curl_setopt($curlproc, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0); | ||||||
|  | 		 | ||||||
| 		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
 | 		curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
 | ||||||
| 		curl_setopt($curlproc, CURLOPT_HTTPHEADER, | 		curl_setopt($curlproc, CURLOPT_HTTPHEADER, | ||||||
| 			["User-Agent: " . config::USER_AGENT, | 			["User-Agent: " . config::USER_AGENT, | ||||||
| 			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", | 			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", | ||||||
| 			"Accept-Language: en-US,en;q=0.5", | 			"Accept-Language: en-US,en;q=0.5", | ||||||
| 			"Accept-Encoding: gzip", | 			"Accept-Encoding: gzip", | ||||||
|  | 			"Referer: https://beta.mwmbl.org/", | ||||||
| 			"DNT: 1", | 			"DNT: 1", | ||||||
|  | 			"Sec-GPC: 1", | ||||||
| 			"Connection: keep-alive", | 			"Connection: keep-alive", | ||||||
| 			"Upgrade-Insecure-Requests: 1", | 			"Upgrade-Insecure-Requests: 1", | ||||||
| 			"Sec-Fetch-Dest: document", | 			"Sec-Fetch-Dest: document", | ||||||
| 			"Sec-Fetch-Mode: navigate", | 			"Sec-Fetch-Mode: navigate", | ||||||
| 			"Sec-Fetch-Site: none", | 			"Sec-Fetch-Site: same-origin", | ||||||
|  | 			"Priority: u=0, i", | ||||||
| 			"Sec-Fetch-User: ?1"] | 			"Sec-Fetch-User: ?1"] | ||||||
| 		); | 		); | ||||||
| 		 | 		 | ||||||
| @ -46,7 +52,7 @@ class mwmbl{ | |||||||
| 		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); | 		curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2); | ||||||
| 		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); | 		curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true); | ||||||
| 		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); | 		curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30); | ||||||
| 		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); | 		curl_setopt($curlproc, CURLOPT_TIMEOUT, 30); // @todo reset
 | ||||||
| 		 | 		 | ||||||
| 		$this->backend->assign_proxy($curlproc, $proxy); | 		$this->backend->assign_proxy($curlproc, $proxy); | ||||||
| 		 | 		 | ||||||
| @ -72,14 +78,14 @@ class mwmbl{ | |||||||
| 		try{ | 		try{ | ||||||
| 			$html = $this->get( | 			$html = $this->get( | ||||||
| 				$this->backend->get_ip(), // no next page!
 | 				$this->backend->get_ip(), // no next page!
 | ||||||
| 				"https://mwmbl.org/app/home/", | 				"https://beta.mwmbl.org/", | ||||||
| 				[ | 				[ | ||||||
| 					"q" => $search | 					"q" => $search | ||||||
| 				] | 				] | ||||||
| 			); | 			); | ||||||
| 		}catch(Exception $error){ | 		}catch(Exception $error){ | ||||||
| 			 | 			 | ||||||
| 			throw new Exception("Failed to fetch HTML"); | 			throw new Exception("Failed to fetch HTML. If you're getting a timeout, make sure you have curl-impersonate setup."); | ||||||
| 		} | 		} | ||||||
| 		 | 		 | ||||||
| 		$out = [ | 		$out = [ | ||||||
| @ -115,6 +121,68 @@ class mwmbl{ | |||||||
| 				$this->fuckhtml | 				$this->fuckhtml | ||||||
| 				->getElementsByTagName("p"); | 				->getElementsByTagName("p"); | ||||||
| 			 | 			 | ||||||
|  | 			$sublinks = []; | ||||||
|  | 			 | ||||||
|  | 			$mores = | ||||||
|  | 				$this->fuckhtml | ||||||
|  | 				->getElementsByClassName( | ||||||
|  | 					"result-link-more", | ||||||
|  | 					"div" | ||||||
|  | 				); | ||||||
|  | 			 | ||||||
|  | 			foreach($mores as $more){ | ||||||
|  | 				 | ||||||
|  | 				$this->fuckhtml->load($more); | ||||||
|  | 				 | ||||||
|  | 				$as = | ||||||
|  | 					$this->fuckhtml | ||||||
|  | 					->getElementsByClassName( | ||||||
|  | 						"more", | ||||||
|  | 						"a" | ||||||
|  | 					); | ||||||
|  | 				 | ||||||
|  | 				if(count($as) === 0){ | ||||||
|  | 					 | ||||||
|  | 					// ?? invalid
 | ||||||
|  | 					continue; | ||||||
|  | 				} | ||||||
|  | 				 | ||||||
|  | 				$sublinks[] = [ | ||||||
|  | 					"title" => | ||||||
|  | 						$this->titledots( | ||||||
|  | 							$this->fuckhtml | ||||||
|  | 							->getTextContent( | ||||||
|  | 								$this->fuckhtml | ||||||
|  | 								->getElementsByClassName( | ||||||
|  | 									"more-title", | ||||||
|  | 									"span" | ||||||
|  | 								)[0] | ||||||
|  | 							) | ||||||
|  | 						), | ||||||
|  | 					"description" => | ||||||
|  | 						$this->titledots( | ||||||
|  | 							$this->fuckhtml | ||||||
|  | 							->getTextContent( | ||||||
|  | 								$this->fuckhtml | ||||||
|  | 								->getElementsByClassName( | ||||||
|  | 									"more-extract", | ||||||
|  | 									"span" | ||||||
|  | 								)[0] | ||||||
|  | 							) | ||||||
|  | 						), | ||||||
|  | 					"url" => | ||||||
|  | 						$this->fuckhtml | ||||||
|  | 						->getTextContent( | ||||||
|  | 							$as[0] | ||||||
|  | 							["attributes"] | ||||||
|  | 							["href"] | ||||||
|  | 						) | ||||||
|  | 				]; | ||||||
|  | 			} | ||||||
|  | 			 | ||||||
|  | 			// reset
 | ||||||
|  | 			$this->fuckhtml->load($result); | ||||||
|  | 			 | ||||||
| 			$out["web"][] = [ | 			$out["web"][] = [ | ||||||
| 				"title" => | 				"title" => | ||||||
| 					$this->titledots( | 					$this->titledots( | ||||||
| @ -153,7 +221,7 @@ class mwmbl{ | |||||||
| 					"url" => null, | 					"url" => null, | ||||||
| 					"ratio" => null | 					"ratio" => null | ||||||
| 				], | 				], | ||||||
| 				"sublink" => [], | 				"sublink" => $sublinks, | ||||||
| 				"table" => [] | 				"table" => [] | ||||||
| 			]; | 			]; | ||||||
| 		} | 		} | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 lolcat
						lolcat