fixed yandex image scraper
This commit is contained in:
		
							parent
							
								
									5236452f45
								
							
						
					
					
						commit
						165d80f80b
					
				| @ -636,6 +636,7 @@ class yandex{ | |||||||
| 			 | 			 | ||||||
| 			throw new Exception("Failed to get JSON"); | 			throw new Exception("Failed to get JSON"); | ||||||
| 		} | 		} | ||||||
|  | 		 | ||||||
| 		/* | 		/* | ||||||
| 		$handle = fopen("scraper/yandex.json", "r"); | 		$handle = fopen("scraper/yandex.json", "r"); | ||||||
| 		$json = fread($handle, filesize("scraper/yandex.json")); | 		$json = fread($handle, filesize("scraper/yandex.json")); | ||||||
| @ -656,68 +657,80 @@ class yandex{ | |||||||
| 			throw new Exception("Failed to decode JSON"); | 			throw new Exception("Failed to decode JSON"); | ||||||
| 		} | 		} | ||||||
| 		 | 		 | ||||||
| 		// get html
 |  | ||||||
| 		$html = ""; |  | ||||||
| 		foreach($json["blocks"] as $block){ |  | ||||||
| 			 |  | ||||||
| 			$html .= $block["html"]; |  | ||||||
| 		} |  | ||||||
| 		 |  | ||||||
| 		$this->fuckhtml->load($html); |  | ||||||
| 		$div = $this->fuckhtml->getElementsByTagName("div"); |  | ||||||
| 		 |  | ||||||
| 		$out = [ | 		$out = [ | ||||||
| 			"status" => "ok", | 			"status" => "ok", | ||||||
| 			"npt" => null, | 			"npt" => null, | ||||||
| 			"image" => [] | 			"image" => [] | ||||||
| 		]; | 		]; | ||||||
| 		 | 		 | ||||||
| 		// check for next page
 | 		// get html
 | ||||||
| 		if( | 		$html = ""; | ||||||
| 			count( | 		foreach($json["blocks"] as $block){ | ||||||
| 				$this->fuckhtml |  | ||||||
| 				->getElementsByClassName( |  | ||||||
| 					"more more_direction_next", |  | ||||||
| 					$div |  | ||||||
| 				) |  | ||||||
| 			) !== 0 |  | ||||||
| 		){ |  | ||||||
| 			 | 			 | ||||||
| 			$request["nsfw"] = $nsfw; | 			$html .= $block["html"]; | ||||||
| 			 | 			 | ||||||
| 			if(isset($request["p"])){ | 			// get next page
 | ||||||
|  | 			if( | ||||||
|  | 				isset($block["params"]["nextPageUrl"]) && | ||||||
|  | 				!empty($block["params"]["nextPageUrl"]) | ||||||
|  | 			){ | ||||||
| 				 | 				 | ||||||
| 				$request["p"]++; | 				$request["nsfw"] = $nsfw; | ||||||
| 			}else{ |  | ||||||
| 				 | 				 | ||||||
| 				$request["p"] = 1; | 				if(isset($request["p"])){ | ||||||
|  | 					 | ||||||
|  | 					$request["p"]++; | ||||||
|  | 				}else{ | ||||||
|  | 					 | ||||||
|  | 					$request["p"] = 1; | ||||||
|  | 				} | ||||||
|  | 				 | ||||||
|  | 				$out["npt"] = | ||||||
|  | 					$this->backend->store( | ||||||
|  | 						json_encode($request), | ||||||
|  | 						"images", | ||||||
|  | 						$proxy | ||||||
|  | 					); | ||||||
| 			} | 			} | ||||||
| 			 |  | ||||||
| 			$out["npt"] = |  | ||||||
| 				$this->backend->store( |  | ||||||
| 					json_encode($request), |  | ||||||
| 					"images", |  | ||||||
| 					$proxy |  | ||||||
| 				); |  | ||||||
| 		} | 		} | ||||||
| 		 | 		 | ||||||
|  | 		$this->fuckhtml->load($html); | ||||||
|  | 		 | ||||||
| 		// get search results
 | 		// get search results
 | ||||||
|  | 		$data = null; | ||||||
|  | 		 | ||||||
| 		foreach( | 		foreach( | ||||||
| 			$this->fuckhtml | 			$this->fuckhtml | ||||||
| 			->getElementsByClassName( | 			->getElementsByClassName( | ||||||
| 				"serp-item serp-item_type_search", | 				"Root", | ||||||
| 				$div | 				"div" | ||||||
| 			) | 			) as $div | ||||||
| 			as $image |  | ||||||
| 		){ | 		){ | ||||||
| 			 | 			 | ||||||
| 			$image = | 			if(isset($div["attributes"]["data-state"])){ | ||||||
| 				json_decode( | 				 | ||||||
| 					$image | 				$tmp = json_decode( | ||||||
| 					["attributes"] | 					$this->fuckhtml | ||||||
| 					["data-bem"], | 					->getTextContent( | ||||||
|  | 						$div["attributes"]["data-state"] | ||||||
|  | 					), | ||||||
| 					true | 					true | ||||||
| 				)["serp-item"]; | 				); | ||||||
|  | 				 | ||||||
|  | 				if(isset($tmp["initialState"]["serpList"])){ | ||||||
|  | 					 | ||||||
|  | 					$data = $tmp; | ||||||
|  | 					break; | ||||||
|  | 				} | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 		 | ||||||
|  | 		if($data === null){ | ||||||
|  | 			 | ||||||
|  | 			throw new Exception("Failed to extract JSON"); | ||||||
|  | 		} | ||||||
|  | 		 | ||||||
|  | 		foreach($data["initialState"]["serpList"]["items"]["entities"] as $image){ | ||||||
| 			 | 			 | ||||||
| 			$title = [html_entity_decode($image["snippet"]["title"], ENT_QUOTES | ENT_HTML5)]; | 			$title = [html_entity_decode($image["snippet"]["title"], ENT_QUOTES | ENT_HTML5)]; | ||||||
| 			 | 			 | ||||||
| @ -738,7 +751,7 @@ class yandex{ | |||||||
| 				"url" => htmlspecialchars_decode($image["snippet"]["url"]) | 				"url" => htmlspecialchars_decode($image["snippet"]["url"]) | ||||||
| 			]; | 			]; | ||||||
| 			 | 			 | ||||||
| 			foreach($image["dups"] as $dup){ | 			foreach($image["viewerData"]["dups"] as $dup){ | ||||||
| 				 | 				 | ||||||
| 				$tmp["source"][]  = [ | 				$tmp["source"][]  = [ | ||||||
| 					"url" => htmlspecialchars_decode($dup["url"]), | 					"url" => htmlspecialchars_decode($dup["url"]), | ||||||
| @ -752,10 +765,10 @@ class yandex{ | |||||||
| 					preg_replace( | 					preg_replace( | ||||||
| 						'/^\/\//', | 						'/^\/\//', | ||||||
| 						"https://", | 						"https://", | ||||||
| 						htmlspecialchars_decode($image["thumb"]["url"]) | 						htmlspecialchars_decode($image["viewerData"]["thumb"]["url"]) | ||||||
| 					), | 					), | ||||||
| 				"width" => (int)$image["thumb"]["size"]["width"], | 				"width" => (int)$image["viewerData"]["thumb"]["size"]["width"], | ||||||
| 				"height" => (int)$image["thumb"]["size"]["height"] | 				"height" => (int)$image["viewerData"]["thumb"]["size"]["height"] | ||||||
| 			]; | 			]; | ||||||
| 			 | 			 | ||||||
| 			$out["image"][] = $tmp; | 			$out["image"][] = $tmp; | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 lolcat
						lolcat