fix yandex web
This commit is contained in:
		
							parent
							
								
									3e1487e614
								
							
						
					
					
						commit
						f73b5f0298
					
				| @ -14,7 +14,7 @@ class yandex{ | |||||||
| 		// backend included in the scraper functions
 | 		// backend included in the scraper functions
 | ||||||
| 	} | 	} | ||||||
| 	 | 	 | ||||||
| 	private function get($proxy, $url, $get = [], $nsfw){ | 	private function get($proxy, $url, $get = [], $nsfw, $get_cookie = 1){ | ||||||
| 		 | 		 | ||||||
| 		$curlproc = curl_init(); | 		$curlproc = curl_init(); | ||||||
| 		 | 		 | ||||||
| @ -25,19 +25,55 @@ class yandex{ | |||||||
| 		 | 		 | ||||||
| 		curl_setopt($curlproc, CURLOPT_URL, $url); | 		curl_setopt($curlproc, CURLOPT_URL, $url); | ||||||
| 		 | 		 | ||||||
|  | 		// extract "i" cookie
 | ||||||
|  | 		if($get_cookie === 0){ | ||||||
|  | 			 | ||||||
|  | 			$cookies_tmp = []; | ||||||
|  | 			curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){ | ||||||
|  | 				 | ||||||
|  | 				$length = strlen($header); | ||||||
|  | 				 | ||||||
|  | 				$header = explode(":", $header, 2); | ||||||
|  | 				 | ||||||
|  | 				if(trim(strtolower($header[0])) == "set-cookie"){ | ||||||
|  | 					 | ||||||
|  | 					$cookie_tmp = explode("=", trim($header[1]), 2); | ||||||
|  | 					 | ||||||
|  | 					$cookies_tmp[trim($cookie_tmp[0])] = | ||||||
|  | 						explode(";", $cookie_tmp[1], 2)[0]; | ||||||
|  | 				} | ||||||
|  | 				 | ||||||
|  | 				return $length; | ||||||
|  | 			}); | ||||||
|  | 		} | ||||||
|  | 		 | ||||||
| 		switch($nsfw){ | 		switch($nsfw){ | ||||||
| 			case "yes": $nsfw = "0"; break; | 			case "yes": $nsfw = "0"; break; | ||||||
| 			case "maybe": $nsfw = "1"; break; | 			case "maybe": $nsfw = "1"; break; | ||||||
| 			case "no": $nsfw = "2"; break; | 			case "no": $nsfw = "2"; break; | ||||||
| 		} | 		} | ||||||
| 		 | 		 | ||||||
|  | 		switch($get_cookie){ | ||||||
|  | 			 | ||||||
|  | 			case 0: | ||||||
|  | 				$cookie = ""; | ||||||
|  | 				break; | ||||||
|  | 			 | ||||||
|  | 			case 1: | ||||||
|  | 				$cookie = "Cookie: yp=" . (time() - 4000033) . ".szm.1:1920x1080:876x1000#" . time() . ".sp.family:" . $nsfw; | ||||||
|  | 				break; | ||||||
|  | 			 | ||||||
|  | 			default: | ||||||
|  | 				$cookie = "Cookie: i=" . $get_cookie; | ||||||
|  | 		} | ||||||
|  | 		 | ||||||
| 		$headers = | 		$headers = | ||||||
| 			["User-Agent: " . config::USER_AGENT, | 			["User-Agent: " . config::USER_AGENT, | ||||||
| 			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", | 			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", | ||||||
| 			"Accept-Encoding: gzip", | 			"Accept-Encoding: gzip", | ||||||
| 			"Accept-Language: en-US,en;q=0.5", | 			"Accept-Language: en-US,en;q=0.5", | ||||||
| 			"DNT: 1", | 			"DNT: 1", | ||||||
| 			"Cookie: yp=" . (time() - 4000033) . ".szm.1:1920x1080:876x1000#" . time() . ".sp.family:" . $nsfw, | 			$cookie, | ||||||
| 			"Referer: https://yandex.com/images/search", | 			"Referer: https://yandex.com/images/search", | ||||||
| 			"Connection: keep-alive", | 			"Connection: keep-alive", | ||||||
| 			"Upgrade-Insecure-Requests: 1", | 			"Upgrade-Insecure-Requests: 1", | ||||||
| @ -59,6 +95,17 @@ class yandex{ | |||||||
| 		 | 		 | ||||||
| 		$data = curl_exec($curlproc); | 		$data = curl_exec($curlproc); | ||||||
| 		 | 		 | ||||||
|  | 		if($get_cookie === 0){ | ||||||
|  | 			 | ||||||
|  | 			if(isset($cookies_tmp["i"])){ | ||||||
|  | 				 | ||||||
|  | 				return $cookies_tmp["i"]; | ||||||
|  | 			}else{ | ||||||
|  | 				 | ||||||
|  | 				throw new Exception("Failed to get Yandex clearance cookie"); | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 		 | ||||||
| 		if(curl_errno($curlproc)){ | 		if(curl_errno($curlproc)){ | ||||||
| 			 | 			 | ||||||
| 			throw new Exception(curl_error($curlproc)); | 			throw new Exception(curl_error($curlproc)); | ||||||
| @ -217,6 +264,23 @@ class yandex{ | |||||||
| 		// https://yandex.com/search/site/?text=minecraft&web=1&frame=1&v=2.0&searchid=3131712
 | 		// https://yandex.com/search/site/?text=minecraft&web=1&frame=1&v=2.0&searchid=3131712
 | ||||||
| 		// &within=777&from_day=26&from_month=8&from_year=2023&to_day=26&to_month=8&to_year=2023
 | 		// &within=777&from_day=26&from_month=8&from_year=2023&to_day=26&to_month=8&to_year=2023
 | ||||||
| 		 | 		 | ||||||
|  | 		// get clearance cookie
 | ||||||
|  | 		if(($cookie = apcu_fetch("yandexweb_cookie")) === false){ | ||||||
|  | 			 | ||||||
|  | 			$proxy = $this->backend->get_ip(); | ||||||
|  | 			 | ||||||
|  | 			$cookie = | ||||||
|  | 				$this->get( | ||||||
|  | 					$proxy, | ||||||
|  | 					"https://yandex.ru/support2/smart-captcha/ru/", | ||||||
|  | 					[], | ||||||
|  | 					false, | ||||||
|  | 					0 | ||||||
|  | 				); | ||||||
|  | 			 | ||||||
|  | 			apcu_store("yandexweb_cookie", $cookie); | ||||||
|  | 		} | ||||||
|  | 		 | ||||||
| 		if($get["npt"]){ | 		if($get["npt"]){ | ||||||
| 			 | 			 | ||||||
| 			[$npt, $proxy] = $this->backend->get($get["npt"], "web"); | 			[$npt, $proxy] = $this->backend->get($get["npt"], "web"); | ||||||
| @ -226,7 +290,8 @@ class yandex{ | |||||||
| 					$proxy, | 					$proxy, | ||||||
| 					"https://yandex.com" . $npt, | 					"https://yandex.com" . $npt, | ||||||
| 					[], | 					[], | ||||||
| 					"yes" | 					"yes", | ||||||
|  | 					$cookie | ||||||
| 				); | 				); | ||||||
| 		}else{ | 		}else{ | ||||||
| 			 | 			 | ||||||
| @ -236,7 +301,7 @@ class yandex{ | |||||||
| 				throw new Exception("Search term is empty!"); | 				throw new Exception("Search term is empty!"); | ||||||
| 			} | 			} | ||||||
| 			 | 			 | ||||||
| 			$proxy = $this->backend->get_ip(); | 			$proxy = !isset($proxy) ? $this->backend->get_ip() : $proxy; | ||||||
| 			$lang = $get["lang"]; | 			$lang = $get["lang"]; | ||||||
| 			$older = $get["older"]; | 			$older = $get["older"]; | ||||||
| 			$newer = $get["newer"]; | 			$newer = $get["newer"]; | ||||||
| @ -283,7 +348,8 @@ class yandex{ | |||||||
| 						$proxy, | 						$proxy, | ||||||
| 						"https://yandex.com/search/site/", | 						"https://yandex.com/search/site/", | ||||||
| 						$params, | 						$params, | ||||||
| 						"yes" | 						"yes", | ||||||
|  | 						$cookie | ||||||
| 					); | 					); | ||||||
| 			}catch(Exception $error){ | 			}catch(Exception $error){ | ||||||
| 				 | 				 | ||||||
| @ -314,6 +380,19 @@ class yandex{ | |||||||
| 		 | 		 | ||||||
| 		$this->fuckhtml->load($html); | 		$this->fuckhtml->load($html); | ||||||
| 		 | 		 | ||||||
|  | 		// Scrape page blocked error
 | ||||||
|  | 		$title = | ||||||
|  | 			$this->fuckhtml | ||||||
|  | 			->getElementsByTagName("title"); | ||||||
|  | 		 | ||||||
|  | 		if( | ||||||
|  | 			count($title) !== 0 && | ||||||
|  | 			$title[0]["innerHTML"] == "403" | ||||||
|  | 		){ | ||||||
|  | 			 | ||||||
|  | 			throw new Exception("Yandex blocked this proxy or 4get instance."); | ||||||
|  | 		} | ||||||
|  | 		 | ||||||
| 		// get nextpage
 | 		// get nextpage
 | ||||||
| 		$npt = | 		$npt = | ||||||
| 			$this->fuckhtml | 			$this->fuckhtml | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 lolcat
						lolcat