2230 lines
		
	
	
		
			42 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
			
		
		
	
	
			2230 lines
		
	
	
		
			42 KiB
		
	
	
	
		
			PHP
		
	
	
	
	
	
| <?php
 | ||
| 
 | ||
| class baidu{
 | ||
| 	
 | ||
/**
 * Set up the Baidu scraper: load the backend and HTML parser helpers
 * and reset the state shared by the request/redirect helpers.
 */
public function __construct(){
	
	// include_once: constructing the scraper a second time (or loading
	// a library that is already present) must not redeclare its class
	include_once "lib/backend.php";
	$this->backend = new backend("baidu");
	
	include_once "lib/fuckhtml.php";
	$this->fuckhtml = new fuckhtml();
	
	// bookkeeping shared by resolve_urls() and redirect_add_url()
	$this->handles = [];
	$this->proc = null;
	$this->handle_category = null;
	$this->handle_increment = 0;
	$this->sublink_increment = 0;
	
	// cookie header value maintained by get(); null until one is set
	$this->cookie = null;
}
 | ||
| 	
 | ||
/**
 * Describe the search filters offered for a result page type.
 *
 * Each filter maps to a "display" label and an "option" entry, which is
 * either the string "_DATE" or a map of option value => label.
 *
 * @param string $page One of "web", "images", "videos" or "news"
 * @return array Filter definitions; empty when the page has no filters
 *               or the page type is not recognised
 */
public function getfilters($page){
	
	switch($page){
		
		case "web":
			return [
				"newer" => [
					"display" => "Newer than",
					"option" => "_DATE"
				],
				"older" => [
					"display" => "Older than",
					"option" => "_DATE"
				]
			];
		
		case "images":
			return [
				"sort" => [
					"display" => "Sort",
					"option" => [
						"relevance" => "Relevance", // no param
						"latest" => "Latest", // &latest=1
						"hot" => "Hot" // &hot=1
					]
				],
				"size" => [
					"display" => "Size",
					"option" => [
						"any" => "Any size",
						"7" => "Extra large (1080px+)", // &z=7
						"6" => "Large (600px~1080px)", // &z=6
						"5" => "Medium (300px~600px)", // &z=5
						"4" => "Small (1px~300px)" // &z=4
					]
				],
				"ratio" => [
					"display" => "Ratio",
					"option" => [
						"any" => "Any ratio",
						"1" => "Tall vertical", // &imgratio=1
						"2" => "Vertical", // &imgratio=2
						"3" => "Square", // &imgratio=3
						"4" => "Horizontal", // &imgratio=4
						"5" => "Wide horizontal" // &imgratio=5
					]
				],
				"format" => [
					"display" => "Format",
					"option" => [
						"any" => "Any format",
						"3" => "JPG", // &imgformat=3
						"5" => "JPEG", // &imgformat=5
						"4" => "PNG", // &imgformat=4
						"2" => "BMP", // &imgformat=2
						"6" => "GIF (Animated)" // &imgformat=6
					]
				],
				"color" => [
					"display" => "Color",
					"option" => [
						"any" => "Any color",
						"1024" => "White", // &ic=1024
						"2048" => "Black & White",
						"512" => "Black",
						"64" => "Magenta",
						"16" => "Blue",
						"1" => "Red",
						"2" => "Yellow",
						"32" => "Purple",
						"4" => "Green",
						"8" => "Teal",
						"256" => "Orange",
						"128" => "Brown"
					]
				],
				"type" => [
					"display" => "Type",
					"option" => [
						"any" => "Any type",
						"hd" => "HD", // &hd=1
						"isImgSet" => "Photo album", // &isImgSet=1
						"copyright" => "Copyright" // &copyright=1
					]
				]
			];
		
		case "videos":
			return [];
		
		case "news":
			return [
				"category" => [
					"display" => "Category",
					"option" => [
						"any" => "All news",
						"media" => "Media websites", // &medium=1
						"baijiahao" => "Baidu Baijiahao" // &medium=2
					]
				]
			];
		
		default:
			// unknown page type: report "no filters" instead of
			// falling off the end and returning null
			return [];
	}
}
 | ||
| 	
 | ||
/**
 * Perform a GET request and return the response body.
 *
 * Cookies received through Set-Cookie response headers are merged into
 * $this->cookie, which is sent back as the Cookie header on later calls.
 *
 * @param mixed $proxy Passed as-is to backend->assign_proxy()
 * @param string $url Target URL
 * @param array $get Query parameters appended to $url; skipped when empty
 * @param string|false $referer When not false, a Referer header is sent and
 *                              the JSON/XHR-style header set is used instead
 *                              of the page-navigation one
 * @return string Response body
 * @throws Exception Carrying the cURL error message when the transfer fails
 */
private function get($proxy, $url, $get = [], $referer = false){
	
	$curlproc = curl_init();
	
	if($get !== []){
		
		$url .= "?" . http_build_query($get);
	}
	
	// collect the cookies the server hands out
	$cookies_tmp = [];
	curl_setopt($curlproc, CURLOPT_HEADERFUNCTION, function($curlproc, $header) use (&$cookies_tmp){
		
		$length = strlen($header);
		
		$header = explode(":", $header, 2);
		
		if(
			isset($header[1]) &&
			trim(strtolower($header[0])) == "set-cookie"
		){
			
			$cookie_tmp = explode("=", trim($header[1]), 2);
			$cookie_name = trim($cookie_tmp[0]);
			
			// skip malformed cookies that lack a name or a value
			if(
				$cookie_name !== "" &&
				isset($cookie_tmp[1])
			){
				
				$cookies_tmp[$cookie_name] =
					explode(";", $cookie_tmp[1], 2)[0];
			}
		}
		
		// cURL expects the number of bytes consumed
		return $length;
	});
	
	curl_setopt($curlproc, CURLOPT_URL, $url);
	
	curl_setopt($curlproc, CURLOPT_ENCODING, ""); // default encoding
	
	//
	// Build the request headers. The order of the entries is kept
	// exactly as before for every referer/cookie combination.
	//
	$is_navigation = $referer === false;
	$has_cookie =
		$this->cookie !== null &&
		$this->cookie !== "";
	
	$headers = [
		"User-Agent: " . config::USER_AGENT,
		$is_navigation ?
			"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" :
			"Accept: application/json, text/plain, */*",
		"Accept-Language: en-US,en;q=0.5",
		"Accept-Encoding: gzip, deflate, br, zstd"
	];
	
	if(!$is_navigation){
		
		$headers[] = "Referer: {$referer}";
	}
	
	$headers[] = "DNT: 1";
	$headers[] = "Sec-GPC: 1";
	$headers[] = "Connection: keep-alive";
	
	if($has_cookie){
		
		$headers[] = "Cookie: {$this->cookie}";
	}
	
	$headers[] = "Upgrade-Insecure-Requests: 1";
	
	if($is_navigation){
		
		$headers[] = "Sec-Fetch-Dest: document";
		$headers[] = "Sec-Fetch-Mode: navigate";
		$headers[] = "Sec-Fetch-Site: cross-site";
		$headers[] = "Priority: u=0, i";
	}else{
		
		$headers[] = "Sec-Fetch-Dest: empty";
		$headers[] = "Sec-Fetch-Mode: cors";
		$headers[] = "Sec-Fetch-Site: same-origin";
	}
	
	curl_setopt($curlproc, CURLOPT_HTTPHEADER, $headers);
	
	curl_setopt($curlproc, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYHOST, 2);
	curl_setopt($curlproc, CURLOPT_SSL_VERIFYPEER, true);
	curl_setopt($curlproc, CURLOPT_CONNECTTIMEOUT, 30);
	curl_setopt($curlproc, CURLOPT_TIMEOUT, 30);
	
	$this->backend->assign_proxy($curlproc, $proxy);
	
	$data = curl_exec($curlproc);
	
	if(curl_errno($curlproc)){
		
		// read the message first, then release the handle before throwing
		$error_message = curl_error($curlproc);
		curl_close($curlproc);
		
		throw new Exception($error_message);
	}
	
	curl_close($curlproc);
	
	//
	// Store cookies: start from what we already hold, then let freshly
	// received cookies replace older values of the same name instead of
	// being appended as duplicates
	//
	$jar = [];
	
	foreach(explode(";", (string)$this->cookie) as $pair){
		
		$pair = trim($pair);
		
		if($pair === ""){
			
			continue;
		}
		
		$jar[explode("=", $pair, 2)[0]] = $pair;
	}
	
	foreach($cookies_tmp as $cookie_name => $cookie_value){
		
		$jar[$cookie_name] = $cookie_name . "=" . $cookie_value;
	}
	
	$this->cookie = implode("; ", $jar);
	
	return $data;
}
 | ||
| 	
 | ||
/**
 * Queue a header-only request for a Baidu redirect link.
 *
 * URLs that do not look like "baidu.com/link?..." are ignored. For the
 * others, a cURL handle is attached to the multi handle in $this->proc
 * and recorded in $this->handles under the current category, result
 * index and sublink index, so resolve_urls() can read the response
 * headers once the transfers have run.
 *
 * @param mixed $proxy Passed as-is to backend->assign_proxy()
 * @param string $url Candidate URL
 * @return void
 */
private function redirect_add_url($proxy, $url){
	
	$redirect_match =
		preg_match(
			'/^https?:\/\/(?:www\.)?baidu\.com\/link\?/',
			$url
		);
	
	if($redirect_match === 0){
		
		// not a baidu redirect
		return;
	}
	
	$request_headers = [
		"User-Agent: " . config::USER_AGENT,
		"Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
		"Accept-Language: en-US,en;q=0.5",
		"Accept-Encoding: gzip, deflate, br, zstd",
		"DNT: 1",
		"Sec-GPC: 1",
		"Connection: keep-alive",
		"Upgrade-Insecure-Requests: 1",
		"Sec-Fetch-Dest: document",
		"Sec-Fetch-Mode: navigate",
		"Sec-Fetch-Site: none",
		"Sec-Fetch-User: ?1",
		"Priority: u=0, i"
	];
	
	$handle = curl_init();
	
	curl_setopt_array(
		$handle,
		[
			CURLOPT_URL => $url,
			CURLOPT_ENCODING => "", // default encoding
			CURLOPT_HTTPHEADER => $request_headers,
			CURLOPT_RETURNTRANSFER => true,
			CURLOPT_SSL_VERIFYHOST => 2,
			CURLOPT_SSL_VERIFYPEER => true,
			CURLOPT_CONNECTTIMEOUT => 30,
			CURLOPT_TIMEOUT => 30,
			// only the response headers are wanted
			CURLOPT_HEADER => true,
			CURLOPT_NOBODY => true
		]
	);
	
	$this->backend->assign_proxy($handle, $proxy);
	
	curl_multi_add_handle($this->proc, $handle);
	
	$category = $this->handle_category;
	$result_index = $this->handle_increment;
	$sublink_index = $this->sublink_increment;
	
	$this->handles[$category][$result_index][$sublink_index] = $handle;
}
 | ||
| 	
 | ||
/**
 * Replace Baidu redirect links inside $collection with their targets.
 *
 * For every listed category, each item's "url" and each of its
 * "sublink" urls is handed to redirect_add_url(). All queued requests
 * then run in parallel, and every response that carries a Location
 * header overwrites the corresponding url in $collection.
 *
 * @param mixed $proxy Passed through to redirect_add_url()
 * @param array $collection Result set, modified in place; categories are
 *                          assumed to be 0-indexed lists, as the original
 *                          counter-based indexing already required
 * @param array $categories Keys of $collection to process
 * @return void
 */
private function resolve_urls($proxy, &$collection, $categories){
	
	// handles recorded by an earlier call are already closed and must
	// not be read again
	$this->handles = [];
	$this->proc = curl_multi_init();
	
	foreach($categories as $category){
		
		if(!isset($collection[$category])){
			
			continue;
		}
		
		$this->handle_category = $category;
		$this->handle_increment = 0;
		
		foreach($collection[$category] as $item){
			
			// slot 0 is the result itself, slots 1..n are its sublinks
			$this->sublink_increment = 0;
			$this->redirect_add_url($proxy, $item["url"]);
			
			if(isset($item["sublink"])){
				
				foreach($item["sublink"] as $sublink){
					
					$this->sublink_increment++;
					$this->redirect_add_url($proxy, $sublink["url"]);
				}
			}
			
			$this->handle_increment++;
		}
	}
	
	$active = 0;
	
	do{
		$status = curl_multi_exec($this->proc, $active);
		
		if(
			$active &&
			$status == CURLM_OK
		){
			
			// wait for socket activity instead of spinning on the CPU
			if(curl_multi_select($this->proc, 1.0) === -1){
				
				// select could not wait on anything; back off briefly
				usleep(1000);
			}
		}
		
	}while($active && $status == CURLM_OK);
	
	//
	// All transfers are finished: copy the redirect targets back
	//
	foreach($this->handles as $category => $items){
		
		foreach($items as $index => $slots){
			
			foreach($slots as $sublinkindex => $handle){
				
				$location = [];
				
				preg_match(
					'/location: ?(.*)$/im',
					(string)curl_multi_getcontent($handle),
					$location
				);
				
				if(isset($location[1])){
					
					if($sublinkindex === 0){
						
						$collection[$category][$index]["url"] = trim($location[1]);
					}else{
						
						$collection[$category][$index]["sublink"][$sublinkindex - 1]["url"] = trim($location[1]);
					}
				}
				
				curl_multi_remove_handle($this->proc, $handle);
				curl_close($handle);
			}
		}
	}
	
	curl_multi_close($this->proc);
	
	// nothing in here is usable anymore
	$this->handles = [];
}
 | ||
| 	
 | ||
/**
 * Swap scraped image entries for the direct-link data found in
 * Baidu's image detail viewer.
 *
 * The viewer page of the second image is fetched (per the original
 * author's note, requesting the second image does not trigger the
 * captcha). Its "image-detail-data" script element holds JSON
 * describing the image set. When that JSON yields at least one usable
 * entry, $data["image"] is replaced with it; in every other case $data
 * is left untouched.
 *
 * @param mixed $proxy Passed through to get()
 * @param array $data Result set, modified in place
 * @return void
 */
private function resolve_images($proxy, &$data){
	
	if(
		!isset($data["image"][1]["url"]) ||
		preg_match(
			'/^https:\/\/image\.baidu\.com\/search\/detail/',
			$data["image"][1]["url"]
		) === 0
	){
		
		// no second image, or its link is not a viewer link: nothing to do
		return;
	}
	
	try{
		
		$html =
			$this->get(
				$proxy,
				$data["image"][1]["url"],
				[]
			);
	}catch(Exception $error){
		
		// fall back to the limited dataset we already have
		return;
	}
	
	$this->fuckhtml->load($html);
	
	$script =
		$this->fuckhtml
		->getElementById(
			"image-detail-data",
			"script"
		);
	
	if(!$script){
		
		return;
	}
	
	$json =
		json_decode(
			$script["innerHTML"],
			true
		);
	
	if(
		!isset($json["data"]["images"]) ||
		!is_array($json["data"]["images"])
	){
		
		return;
	}
	
	//
	// Collect into a temporary list so the existing images are only
	// discarded once we know there is something to replace them with
	//
	$images = [];
	
	foreach($json["data"]["images"] as $image){
		
		if(
			!isset($image["objurl"]) ||
			!isset($image["thumburl"])
		){
			
			// unusable entry
			continue;
		}
		
		// thumbnail dimensions travel in the thumbnail URL's query
		// string; parse_url() yields null when there is no query
		$thumb_size = [];
		parse_str(
			(string)parse_url($image["thumburl"], PHP_URL_QUERY),
			$thumb_size
		);
		
		$images[] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$image["titleShow"] ?? ""
				),
			"source" => [
				[
					"url" => $image["objurl"],
					"width" => (int)($image["width"] ?? 0),
					"height" => (int)($image["height"] ?? 0)
				],
				[ // thumbnail
					"url" => $image["thumburl"],
					"width" => (int)($thumb_size["w"] ?? 0),
					"height" => (int)($thumb_size["h"] ?? 0)
				]
			],
			"url" => $image["fromUrl"] ?? null
		];
	}
	
	if(count($images) !== 0){
		
		$data["image"] = $images;
	}
}
 | ||
| 	
 | ||
/**
 * Run a Baidu web search, or continue one from a next-page token.
 *
 * @param array $get Request parameters. Reads "npt" (next-page token,
 *                   falsy for a new search), "s" (query) and the
 *                   "newer"/"older" date filters (false when unset).
 * @return mixed Whatever parse_search() returns for the fetched page
 * @throws Exception When the token cannot be decoded or the page
 *                   cannot be fetched
 */
public function web($get){
	
	if($get["npt"]){
		
		//
		// Continue a previous search from its stored state
		//
		[$json, $proxy] = $this->backend->get($get["npt"], "web");
		
		$json = json_decode($json, true);
		
		if(
			!isset($json["req"]) ||
			!is_array($json["req"])
		){
			
			throw new Exception("Failed to decode next page token");
		}
		
		$this->cookie = $json["cookie"] ?? null;
		$npt_data = $json["req"];
		
		// move one page forward (20 results per page, see "rn" below)
		$npt_data["pn"] = ($npt_data["pn"] ?? 0) + 20;
		
		$is_first_page = false;
		
	}else{
		
		//
		// Fresh search
		//
		$proxy = $this->backend->get_ip();
		
		// @TODO the original author left a disabled warm-up request to
		// https://www.baidu.com here (it would pre-fill $this->cookie)
		// and noted that it is probably not needed
		
		$npt_data = [
			"wd" => $get["s"],
			"rn" => 20
		];
		
		// &gpc=stf%3D0%2C1752638400|stftype%3D2
		if(
			$get["older"] !== false ||
			$get["newer"] !== false
		){
			
			// NOTE(review): stf is taken to be "<from>,<to>", which is
			// what the sample above (0,<timestamp>) suggests -- confirm.
			// "newer than" is therefore the lower bound and "older than"
			// the upper bound; open ends default to 0 and to now.
			$from =
				$get["newer"] === false ?
				0 : $get["newer"];
			
			$to =
				$get["older"] === false ?
				time() : $get["older"];
			
			$npt_data["gpc"] = "stf={$from},{$to}|stftype=2";
		}
		
		$is_first_page = true;
	}
	
	try{
		
		$html = $this->get(
			$proxy,
			"https://www.baidu.com/s",
			$npt_data
		);
	}catch(Exception $error){
		
		// same message as before, with the cause attached for debugging
		throw new Exception("Failed to fetch search page", 0, $error);
	}
	
	if($is_first_page){
		
		// the first request goes out without an offset; it is only
		// recorded afterwards so the next-page token can build on it
		$npt_data["pn"] = 0;
	}
	
	return $this->parse_search($proxy, "web", $npt_data, $html);
}
 | ||
| 	
 | ||
| 	private function parse_search($proxy, $pagetype, $npt_data, $html){
 | ||
| 		
 | ||
| 		// @HACK
 | ||
| 		// remove newlines from the html, cause it fucks with fuckhtml
 | ||
| 		$html = str_replace(["\n", "\r"], "", $html);
 | ||
| 		
 | ||
| 		$out = [
 | ||
| 			"status" => "ok",
 | ||
| 			"spelling" => [
 | ||
| 				"type" => "no_correction",
 | ||
| 				"using" => null,
 | ||
| 				"correction" => null
 | ||
| 			],
 | ||
| 			"npt" => null,
 | ||
| 			"answer" => [],
 | ||
| 			"web" => [],
 | ||
| 			"image" => [],
 | ||
| 			"video" => [],
 | ||
| 			"news" => [],
 | ||
| 			"related" => []
 | ||
| 		];
 | ||
| 		
 | ||
| 		$this->fuckhtml->load($html);
 | ||
| 		
 | ||
| 		$this->detect_ass();
 | ||
| 		
 | ||
| 		$datafields =
 | ||
| 			$this->fuckhtml
 | ||
| 			->getElementsByAttributeName(
 | ||
| 				"id",
 | ||
| 				"div"
 | ||
| 			);
 | ||
| 		
 | ||
| 		//
 | ||
| 		// Get next page
 | ||
| 		//
 | ||
| 		$npt =
 | ||
| 			$this->fuckhtml
 | ||
| 			->getElementsByClassName(
 | ||
| 				"n",
 | ||
| 				"a"
 | ||
| 			);
 | ||
| 		
 | ||
| 		if(count($npt) !== 0){
 | ||
| 			
 | ||
| 			$out["npt"] =
 | ||
| 				$this->backend->store(
 | ||
| 					json_encode([
 | ||
| 						"req" => $npt_data,
 | ||
| 						"cookie" => $this->cookie
 | ||
| 					]),
 | ||
| 					$pagetype,
 | ||
| 					$proxy
 | ||
| 				);
 | ||
| 		}
 | ||
| 		
 | ||
| 		//
 | ||
| 		// Get related searches
 | ||
| 		//
 | ||
| 		$related_container =
 | ||
| 			$this->fuckhtml
 | ||
| 			->getElementById(
 | ||
| 				"rs_new",
 | ||
| 				$datafields
 | ||
| 			);
 | ||
| 		
 | ||
| 		if($related_container){
 | ||
| 			
 | ||
| 			$this->fuckhtml->load($related_container);
 | ||
| 			
 | ||
| 			$as =
 | ||
| 				$this->fuckhtml
 | ||
| 				->getElementsByClassName(
 | ||
| 					"c-color-link",
 | ||
| 					"a"
 | ||
| 				);
 | ||
| 			
 | ||
| 			foreach($as as $a){
 | ||
| 				
 | ||
| 				$text =
 | ||
| 					explode(
 | ||
| 						">",
 | ||
| 						$this->fuckhtml
 | ||
| 						->getTextContent(
 | ||
| 							$a
 | ||
| 						),
 | ||
| 						2
 | ||
| 					);
 | ||
| 				
 | ||
| 				$out["related"][] = $text[count($text) - 1];
 | ||
| 			}
 | ||
| 		}
 | ||
| 		
 | ||
| 		foreach($datafields as $datafield){
 | ||
| 			
 | ||
| 			if(
 | ||
| 				!isset($datafield["attributes"]["id"]) ||
 | ||
| 				preg_match(
 | ||
| 					'/^[0-9]+$/',
 | ||
| 					$datafield["attributes"]["id"]
 | ||
| 				) === 0
 | ||
| 			){
 | ||
| 				
 | ||
| 				// not a search result
 | ||
| 				continue;
 | ||
| 			}
 | ||
| 			
 | ||
| 			$this->fuckhtml->load($datafield);
 | ||
| 			$div =
 | ||
| 				$this->fuckhtml
 | ||
| 				->getElementsByTagName(
 | ||
| 					"div"
 | ||
| 				);
 | ||
| 			
 | ||
| 			//
 | ||
| 			// Don't parse as a search result if it's a card
 | ||
| 			//
 | ||
| 			$card =
 | ||
| 				$this->fuckhtml
 | ||
| 				->getElementsByClassName(
 | ||
| 					"cosc-card",
 | ||
| 					$div
 | ||
| 				);
 | ||
| 			
 | ||
| 			if(count($card) !== 0){
 | ||
| 				
 | ||
| 				//
 | ||
| 				// Parse chinese youtube shorts
 | ||
| 				//
 | ||
| 				$ytshorts_probe =
 | ||
| 					$this->fuckhtml
 | ||
| 					->getElementsByClassName(
 | ||
| 						"tts-b-item",
 | ||
| 						$div
 | ||
| 					);
 | ||
| 				
 | ||
| 				if(count($ytshorts_probe) !== 0){
 | ||
| 					
 | ||
| 					$videos =
 | ||
| 						$this->fuckhtml
 | ||
| 						->getElementsByAttributeValue(
 | ||
| 							"data-show",
 | ||
| 							"list",
 | ||
| 							$div
 | ||
| 						);
 | ||
| 					
 | ||
| 					foreach($videos as $video){
 | ||
| 						
 | ||
| 						$this->fuckhtml->load($video);
 | ||
| 						
 | ||
| 						$title =
 | ||
| 							$this->fuckhtml
 | ||
| 							->getElementsByClassName(
 | ||
| 								"cosc-title-slot",
 | ||
| 								"span"
 | ||
| 							);
 | ||
| 						
 | ||
| 						if(count($title) === 0){
 | ||
| 							
 | ||
| 							continue;
 | ||
| 						}
 | ||
| 						
 | ||
| 						$url =
 | ||
| 							$this->fuckhtml
 | ||
| 							->getElementsByTagName(
 | ||
| 								"a"
 | ||
| 							);
 | ||
| 						
 | ||
| 						if(count($url) === 0){
 | ||
| 							
 | ||
| 							continue;
 | ||
| 						}
 | ||
| 						
 | ||
| 						$image =
 | ||
| 							$this->fuckhtml
 | ||
| 							->getElementsByClassName(
 | ||
| 								"cos-image-body",
 | ||
| 								"img"
 | ||
| 							);
 | ||
| 						
 | ||
| 						if(count($image) === 0){
 | ||
| 							
 | ||
| 							$image = [
 | ||
| 								"ratio" => null,
 | ||
| 								"url" => null
 | ||
| 							];
 | ||
| 						}else{
 | ||
| 							
 | ||
| 							$image = [
 | ||
| 								"ratio" => "1:1",
 | ||
| 								"url" =>
 | ||
| 									$this->fuckhtml
 | ||
| 									->getTextContent(
 | ||
| 										$image[0]["attributes"]["src"]
 | ||
| 									)
 | ||
| 							];
 | ||
| 						}
 | ||
| 						
 | ||
| 						// get duration
 | ||
| 						$divs =
 | ||
| 							$this->fuckhtml
 | ||
| 							->getElementsByAttributeName(
 | ||
| 								"class",
 | ||
| 								"div"
 | ||
| 							);
 | ||
| 						
 | ||
| 						$duration = null;
 | ||
| 						foreach($divs as $probe){
 | ||
| 							
 | ||
| 							if(strpos($probe["attributes"]["class"], "tag-bottom-right") !== false){
 | ||
| 								
 | ||
| 								$duration =
 | ||
| 									$this->hms2int(
 | ||
| 										$this->fuckhtml
 | ||
| 										->getTextContent(
 | ||
| 											$probe
 | ||
| 										)
 | ||
| 									);
 | ||
| 								break;
 | ||
| 							}
 | ||
| 						}
 | ||
| 						
 | ||
| 						$out["video"][] = [
 | ||
| 							"title" =>
 | ||
| 								$this->fuckhtml
 | ||
| 								->getTextContent(
 | ||
| 									$title[0]
 | ||
| 								),
 | ||
| 							"description" => null,
 | ||
| 							"date" => null,
 | ||
| 							"duration" => $duration,
 | ||
| 							"views" => null,
 | ||
| 							"thumb" => $image,
 | ||
| 							"url" =>
 | ||
| 								$this->fuckhtml
 | ||
| 								->getTextContent(
 | ||
| 									$url[0]["attributes"]["href"]
 | ||
| 								)
 | ||
| 						];
 | ||
| 					}
 | ||
| 				}
 | ||
| 				
 | ||
| 				//
 | ||
| 				// Parse image carousel
 | ||
| 				//
 | ||
| 				$is_image_carousel = false;
 | ||
| 				foreach($div as $d){
 | ||
| 					
 | ||
| 					if(
 | ||
| 						isset($d["attributes"]["class"]) &&
 | ||
| 						strpos($d["attributes"]["class"], "image-container") !== false
 | ||
| 					){
 | ||
| 						
 | ||
| 						$is_image_carousel = true;
 | ||
| 						break;
 | ||
| 					}
 | ||
| 				}
 | ||
| 				
 | ||
| 				if($is_image_carousel){
 | ||
| 					
 | ||
| 					preg_match(
 | ||
| 						'/<!--s-data:([\S\s]*)-->/U',
 | ||
| 						$datafield["innerHTML"],
 | ||
| 						$matches
 | ||
| 					);
 | ||
| 					
 | ||
| 					if(isset($matches[1])){
 | ||
| 						
 | ||
| 						// weird behavior with the smaller image carousel where --cos* CSS variables are escaped wrong
 | ||
| 						$json =
 | ||
| 							$this->fuckhtml
 | ||
| 							->parseJsObject(
 | ||
| 								str_replace(
 | ||
| 									"-\-",
 | ||
| 									"--",
 | ||
| 									$matches[1]
 | ||
| 								)
 | ||
| 							);
 | ||
| 						
 | ||
| 						if(
 | ||
| 							$json !== null &&
 | ||
| 							isset($json["imageList"][0]["images"])
 | ||
| 						){
 | ||
| 							
 | ||
| 							// parse image carousel
 | ||
| 							foreach($json["imageList"][0]["images"] as $image){
 | ||
| 								
 | ||
| 								parse_str(parse_url($image["thumburl"], PHP_URL_QUERY), $thumb_size);
 | ||
| 								
 | ||
| 								$out["image"][] = [
 | ||
| 									"title" => "image",
 | ||
| 									"source" => [
 | ||
| 										[
 | ||
| 											"url" => $image["objurl"],
 | ||
| 											"width" => (int)$image["width"],
 | ||
| 											"height" => (int)$image["height"]
 | ||
| 										],
 | ||
| 										[ // thumbnail
 | ||
| 											"url" => $image["thumburl"],
 | ||
| 											"width" => (int)$thumb_size["w"],
 | ||
| 											"height" => (int)$thumb_size["h"]
 | ||
| 										]
 | ||
| 									],
 | ||
| 									"url" => $image["jumpUrl"]
 | ||
| 								];
 | ||
| 							}
 | ||
| 						}
 | ||
| 					}
 | ||
| 				}
 | ||
| 				continue;
 | ||
| 			}
 | ||
| 			
 | ||
| 			if(!isset($datafield["attributes"]["mu"])){
 | ||
| 				
 | ||
| 				// dont scrape if we dont have the direct link
 | ||
| 				continue;
 | ||
| 			}
 | ||
| 			
 | ||
| 			// class:FYB_RD -> News garbage, IGNORE
 | ||
| 			
 | ||
| 			$result =
 | ||
| 				$this->fuckhtml
 | ||
| 				->getElementsByClassName(
 | ||
| 					"result",
 | ||
| 					[$datafield]
 | ||
| 				);
 | ||
| 			
 | ||
| 			if(count($result) !== 0){
 | ||
| 				
 | ||
| 				//
 | ||
| 				// Parse normal search result
 | ||
| 				//
 | ||
| 				
 | ||
| 				$title =
 | ||
| 					$this->fuckhtml
 | ||
| 					->getElementsByClassName(
 | ||
| 						"sc-link",
 | ||
| 						"a"
 | ||
| 					);
 | ||
| 				
 | ||
| 				if(count($title) === 0){
 | ||
| 					
 | ||
| 					// should not happen
 | ||
| 					continue;
 | ||
| 				}
 | ||
| 				
 | ||
| 				$title =
 | ||
| 					$this->titledots(
 | ||
| 						$this->fuckhtml
 | ||
| 						->getTextContent(
 | ||
| 							$title[0]
 | ||
| 						)
 | ||
| 					);
 | ||
| 				
 | ||
| 				$description =
 | ||
| 					$this->fuckhtml
 | ||
| 					->getElementsByClassName(
 | ||
| 						"c-color",
 | ||
| 						$div
 | ||
| 					);
 | ||
| 				
 | ||
| 				if(count($description) !== 0){
 | ||
| 					
 | ||
| 					$this->fuckhtml->load($description[0]);
 | ||
| 					
 | ||
| 					$description =
 | ||
| 						$this->fuckhtml
 | ||
| 						->getElementsByAttributeName(
 | ||
| 							"class",
 | ||
| 							"span"
 | ||
| 						);
 | ||
| 					
 | ||
| 					$found_desc = false;
 | ||
| 					foreach($description as $desc){
 | ||
| 						
 | ||
| 						if(stripos($desc["attributes"]["class"], "summary-text") !== false){
 | ||
| 							
 | ||
| 							$found_desc = true;
 | ||
| 							$description =
 | ||
| 								$this->titledots(
 | ||
| 									$this->fuckhtml
 | ||
| 									->getTextContent(
 | ||
| 										$desc
 | ||
| 									)
 | ||
| 								);
 | ||
| 							break;
 | ||
| 						}
 | ||
| 					}
 | ||
| 					
 | ||
| 					if($found_desc === false){
 | ||
| 						
 | ||
| 						$description = null;
 | ||
| 					}
 | ||
| 					
 | ||
| 					$this->fuckhtml->load($datafield);
 | ||
| 				}else{
 | ||
| 					
 | ||
| 					$description = null;
 | ||
| 				}
 | ||
| 				
 | ||
| 				// parse date
 | ||
| 				$date_probe =
 | ||
| 					$this->fuckhtml
 | ||
| 					->getElementsByClassName(
 | ||
| 						"cos-color-text-minor",
 | ||
| 						"span"
 | ||
| 					);
 | ||
| 				
 | ||
| 				if(count($date_probe) !== 0){
 | ||
| 					
 | ||
| 					$date =
 | ||
| 						$this->parse_time(
 | ||
| 							$this->fuckhtml
 | ||
| 							->getTextContent(
 | ||
| 								$date_probe[0]
 | ||
| 							)
 | ||
| 						);
 | ||
| 				}else{
 | ||
| 					
 | ||
| 					$date = null;
 | ||
| 				}
 | ||
| 				
 | ||
| 				// parse image
 | ||
| 				$img =
 | ||
| 					$this->fuckhtml
 | ||
| 					->getElementsByTagName(
 | ||
| 						"img"
 | ||
| 					);
 | ||
| 				
 | ||
| 				if(count($img) !== 0){
 | ||
| 					
 | ||
| 					$image = [
 | ||
| 						"ratio" => "16:9",
 | ||
| 						"url" =>
 | ||
| 							$this->unfuckthumb(
 | ||
| 								$this->fuckhtml
 | ||
| 								->getTextContent(
 | ||
| 									$img[0]["attributes"]["src"]
 | ||
| 								)
 | ||
| 							)
 | ||
| 					];
 | ||
| 				}else{
 | ||
| 					
 | ||
| 					$image = [
 | ||
| 						"ratio" => null,
 | ||
| 						"url" => null
 | ||
| 					];
 | ||
| 				}
 | ||
| 				
 | ||
| 				// get page type
 | ||
| 				$pagetype_probe =
 | ||
| 					$this->fuckhtml
 | ||
| 					->getElementsByTagName(
 | ||
| 						"b"
 | ||
| 					);
 | ||
| 				
 | ||
| 				$pagetype = "web";
 | ||
| 				foreach($pagetype_probe as $probe){
 | ||
| 					
 | ||
| 					$pagetype =
 | ||
| 						strtolower(
 | ||
| 							trim(
 | ||
| 								$this->fuckhtml
 | ||
| 								->getTextContent(
 | ||
| 									$probe
 | ||
| 								),
 | ||
| 								" 【】"
 | ||
| 							)
 | ||
| 						);
 | ||
| 				}
 | ||
| 				
 | ||
| 				// get extra links
 | ||
| 				$sublinks = [];
 | ||
| 				
 | ||
| 				foreach($div as $d){
 | ||
| 					
 | ||
| 					if(
 | ||
| 						isset($d["attributes"]["class"]) &&
 | ||
| 						strpos($d["attributes"]["class"], "exta-link") !== false
 | ||
| 					){
 | ||
| 						
 | ||
| 						$this->fuckhtml->load($d);
 | ||
| 						
 | ||
| 						$links =
 | ||
| 							$this->fuckhtml
 | ||
| 							->getElementsByClassName(
 | ||
| 								"cos-space-mt-xs",
 | ||
| 								"div"
 | ||
| 							);
 | ||
| 						
 | ||
| 						foreach($links as $link){
 | ||
| 							
 | ||
| 							$this->fuckhtml->load($link);
 | ||
| 							$s_title =
 | ||
| 								$this->fuckhtml
 | ||
| 								->getElementsByTagName(
 | ||
| 									"h3"
 | ||
| 								);
 | ||
| 							
 | ||
| 							if(count($s_title) === 0){
 | ||
| 								
 | ||
| 								// should not happen
 | ||
| 								continue;
 | ||
| 							}
 | ||
| 							
 | ||
| 							$data2 =
 | ||
| 								json_decode(
 | ||
| 									$this->fuckhtml
 | ||
| 									->getTextContent(
 | ||
| 										$s_title[0]["attributes"]["data-click"]
 | ||
| 									),
 | ||
| 									true
 | ||
| 								);
 | ||
| 							
 | ||
| 							if(!isset($data2["clk_info"])){
 | ||
| 								
 | ||
| 								// wtf
 | ||
| 								continue;
 | ||
| 							}
 | ||
| 							
 | ||
| 							$data2 =
 | ||
| 								json_decode(
 | ||
| 									$data2["clk_info"],
 | ||
| 									true
 | ||
| 								);
 | ||
| 							
 | ||
| 							if(!isset($data2["url"])){
 | ||
| 								
 | ||
| 								// no link, fuck off
 | ||
| 								continue;
 | ||
| 							}
 | ||
| 							
 | ||
| 							$url =
 | ||
| 								rawurldecode(
 | ||
| 									$data2["url"]
 | ||
| 								);
 | ||
| 							
 | ||
| 							$data =
 | ||
| 								$this->fuckhtml
 | ||
| 								->getElementsByTagName(
 | ||
| 									"p"
 | ||
| 								);
 | ||
| 							
 | ||
| 							$s_description = null;
 | ||
| 							
 | ||
| 							if(count($data) !== 0){
 | ||
| 								
 | ||
| 								$data =
 | ||
| 									json_decode(
 | ||
| 										$this->fuckhtml
 | ||
| 										->getTextContent(
 | ||
| 											$data[0]["attributes"]["sub-show-log"]
 | ||
| 										),
 | ||
| 										true
 | ||
| 									);
 | ||
| 								
 | ||
| 								if(isset($data["ext"]["content"])){
 | ||
| 									
 | ||
| 									$s_description = $data["ext"]["content"];
 | ||
| 								}
 | ||
| 							}
 | ||
| 							
 | ||
| 							$sublinks[] = [
 | ||
| 								"title" =>
 | ||
| 									$this->fuckhtml
 | ||
| 									->getTextContent(
 | ||
| 										$s_title[0]
 | ||
| 									),
 | ||
| 								"description" => $s_description,
 | ||
| 								"url" => $url,
 | ||
| 								"date" => null
 | ||
| 							];
 | ||
| 						}
 | ||
| 						break;
 | ||
| 					}
 | ||
| 				}
 | ||
| 				
 | ||
| 				$out["web"][] = [
 | ||
| 					"title" => $title,
 | ||
| 					"description" => $description,
 | ||
| 					"url" =>
 | ||
| 						$this->fuckhtml
 | ||
| 						->getTextContent(
 | ||
| 							$datafield["attributes"]["mu"]
 | ||
| 						),
 | ||
| 					"date" => $date,
 | ||
| 					"type" => $pagetype,
 | ||
| 					"thumb" => $image,
 | ||
| 					"sublink" => $sublinks,
 | ||
| 					"table" => []
 | ||
| 				];
 | ||
| 				
 | ||
| 				continue;
 | ||
| 			}
 | ||
| 			
 | ||
| 			// parse special result
 | ||
| 			$result =
 | ||
| 				$this->fuckhtml
 | ||
| 				->getElementsByClassName(
 | ||
| 					"result-op",
 | ||
| 					[$datafield]
 | ||
| 				);
 | ||
| 			
 | ||
| 			if(count($result) !== 0){
 | ||
| 				
 | ||
| 				//
 | ||
| 				// Parse video carousel
 | ||
| 				//
 | ||
| 				if(
 | ||
| 					isset($datafield["attributes"]["tpl"]) &&
 | ||
| 					stripos($datafield["attributes"]["tpl"], "video") !== false
 | ||
| 				){
 | ||
| 					
 | ||
| 					preg_match(
 | ||
| 						'/<!--s-data:([\S\s]*)-->/U',
 | ||
| 						$datafield["innerHTML"],
 | ||
| 						$matches
 | ||
| 					);
 | ||
| 					
 | ||
| 					if(isset($matches[1])){
 | ||
| 					
 | ||
| 						$json =
 | ||
| 							json_decode(
 | ||
| 								$matches[1],
 | ||
| 								true
 | ||
| 							);
 | ||
| 						
 | ||
| 						if($json !== null){
 | ||
| 							
 | ||
| 							foreach($json["videoList"] as $video){
 | ||
| 								
 | ||
| 								$out["video"][] = [
 | ||
| 									"title" => $video["title"],
 | ||
| 									"description" =>
 | ||
| 										$this->titledots(
 | ||
| 											$video["desc"]
 | ||
| 										),
 | ||
| 									"date" =>
 | ||
| 										$this->parse_time(
 | ||
| 											$video["pubTime"]
 | ||
| 										),
 | ||
| 									"duration" =>
 | ||
| 										$this->hms2int(
 | ||
| 											$video["duration"]
 | ||
| 										),
 | ||
| 									"views" =>
 | ||
| 										$this->parse_viewcount(
 | ||
| 											$video["playCount"]
 | ||
| 										),
 | ||
| 									"thumb" => [
 | ||
| 										"ratio" => "16:9",
 | ||
| 										"url" => $video["poster"]
 | ||
| 									],
 | ||
| 									"url" => $video["bindProps"]["link"]
 | ||
| 								];
 | ||
| 							}
 | ||
| 						}
 | ||
| 					}
 | ||
| 					continue;
 | ||
| 				}
 | ||
| 				
 | ||
| 				//
 | ||
| 				// Special result div (wiki entries, rich divs)
 | ||
| 				//
 | ||
| 				$title =
 | ||
| 					$this->fuckhtml
 | ||
| 					->getElementsByTagName(
 | ||
| 						"h3"
 | ||
| 					);
 | ||
| 				
 | ||
| 				if(count($title) === 0){
 | ||
| 					
 | ||
| 					// should have a title somewhere
 | ||
| 					continue;
 | ||
| 				}
 | ||
| 				
 | ||
| 				$title =
 | ||
| 					explode(
 | ||
| 						">",
 | ||
| 						$this->fuckhtml
 | ||
| 						->getTextContent(
 | ||
| 							$title[0]
 | ||
| 						),
 | ||
| 						2
 | ||
| 					);
 | ||
| 				
 | ||
| 				if(count($title) === 2){
 | ||
| 					
 | ||
| 					$title = $title[1];
 | ||
| 				}else{
 | ||
| 					
 | ||
| 					$title = $title[0];
 | ||
| 				}
 | ||
| 				
 | ||
| 				// probe for wiki-like entry
 | ||
| 				$description =
 | ||
| 					$this->fuckhtml
 | ||
| 					->getElementsByClassName(
 | ||
| 						"sc-paragraph",
 | ||
| 						"p"
 | ||
| 					);
 | ||
| 				
 | ||
| 				if(count($description) === 0){
 | ||
| 					
 | ||
| 					// try and get grey description
 | ||
| 					$description =
 | ||
| 						$this->fuckhtml
 | ||
| 						->getElementsByClassName(
 | ||
| 							"c-color-gray2",
 | ||
| 							"p"
 | ||
| 						);
 | ||
| 					
 | ||
| 					if(count($description) === 0){
 | ||
| 						
 | ||
| 						// probe for special social media description
 | ||
| 						$description =
 | ||
| 							$this->fuckhtml
 | ||
| 							->getElementsByClassName(
 | ||
| 								"c-color-text",
 | ||
| 								"div"
 | ||
| 							);
 | ||
| 						
 | ||
| 						if(isset($description[0]["attributes"]["aria-label"])){
 | ||
| 							
 | ||
| 							$description =
 | ||
| 								$this->fuckhtml
 | ||
| 								->getTextContent(
 | ||
| 									$description[0]
 | ||
| 									["attributes"]
 | ||
| 									["aria-label"]
 | ||
| 								);
 | ||
| 						}else{
 | ||
| 							
 | ||
| 							// check for news tab description
 | ||
| 							$span =
 | ||
| 								$this->fuckhtml
 | ||
| 								->getElementsByClassName(
 | ||
| 									"c-font-normal",
 | ||
| 									"span"
 | ||
| 								);
 | ||
| 							
 | ||
| 							$description = null;
 | ||
| 							
 | ||
| 							foreach($span as $s){
 | ||
| 								
 | ||
| 								if(isset($s["attributes"]["aria-label"])){
 | ||
| 									
 | ||
| 									$description =
 | ||
| 										$this->titledots(
 | ||
| 											$this->fuckhtml
 | ||
| 											->getTextContent(
 | ||
| 												$span[count($span) - 1]
 | ||
| 											)
 | ||
| 										);
 | ||
| 									
 | ||
| 									break;
 | ||
| 								}
 | ||
| 							}
 | ||
| 						}
 | ||
| 					}else{
 | ||
| 						
 | ||
| 						$description =
 | ||
| 							$this->fuckhtml
 | ||
| 							->getTextContent(
 | ||
| 								$description[0]
 | ||
| 							);
 | ||
| 					}
 | ||
| 					
 | ||
| 				}else{
 | ||
| 					
 | ||
| 					preg_match(
 | ||
| 						'/<!--s-text-->([\S\s]*)<!--\/s-text-->/U',
 | ||
| 						$description[count($description) - 1]["innerHTML"],
 | ||
| 						$matches
 | ||
| 					);
 | ||
| 					
 | ||
| 					if(isset($matches[1])){
 | ||
| 						
 | ||
| 						$description =
 | ||
| 							$this->titledots(
 | ||
| 								$this->fuckhtml
 | ||
| 								->getTextContent(
 | ||
| 									$matches[1]
 | ||
| 								)
 | ||
| 							);
 | ||
| 					}else{
 | ||
| 						
 | ||
| 						$description = null;
 | ||
| 					}
 | ||
| 				}
 | ||
| 				
 | ||
| 				// get thumbnail
 | ||
| 				$thumb =
 | ||
| 					$this->fuckhtml
 | ||
| 					->getElementsByTagName(
 | ||
| 						"img"
 | ||
| 					);
 | ||
| 				
 | ||
| 				if(count($thumb) !== 0){
 | ||
| 					
 | ||
| 					$thumb = [
 | ||
| 						"ratio" => "1:1",
 | ||
| 						"url" =>
 | ||
| 							$this->unfuckthumb(
 | ||
| 								$this->fuckhtml
 | ||
| 								->getTextContent(
 | ||
| 									$thumb[0]["attributes"]["src"]
 | ||
| 								)
 | ||
| 							)
 | ||
| 					];
 | ||
| 				}else{
 | ||
| 					
 | ||
| 					$thumb = [
 | ||
| 						"ratio" => null,
 | ||
| 						"url" => null
 | ||
| 					];
 | ||
| 				}
 | ||
| 				
 | ||
| 				// get sublinks
 | ||
| 				preg_match(
 | ||
| 					'/<!--s-data:([\S\s]*)-->/U',
 | ||
| 					$datafield["innerHTML"],
 | ||
| 					$matches
 | ||
| 				);
 | ||
| 				
 | ||
| 				$sublinks = [];
 | ||
| 				
 | ||
| 				if(isset($matches[1])){
 | ||
| 					
 | ||
| 					$json =
 | ||
| 						json_decode(
 | ||
| 							$matches[1],
 | ||
| 							true
 | ||
| 						);
 | ||
| 					
 | ||
| 					if($json !== null){
 | ||
| 						
 | ||
| 						if(isset($json["buttons"])){
 | ||
| 							
 | ||
| 							foreach($json["buttons"] as $button){
 | ||
| 								
 | ||
| 								$sublinks[] = [
 | ||
| 									"title" => $button["text"],
 | ||
| 									"description" => null,
 | ||
| 									"date" => null,
 | ||
| 									"url" => $button["url"]
 | ||
| 								];
 | ||
| 							}
 | ||
| 						}elseif(isset($json["mthreadList"])){
 | ||
| 							
 | ||
| 							foreach($json["mthreadList"] as $thread){
 | ||
| 								
 | ||
| 								$sublinks[] = [
 | ||
| 									"title" =>
 | ||
| 										$this->fuckhtml
 | ||
| 										->getTextContent(
 | ||
| 											$thread["title"]
 | ||
| 										),
 | ||
| 									"description" => null,
 | ||
| 									"date" => null,
 | ||
| 									"url" => $thread["ttsInfo"]["titleUrl"]
 | ||
| 								];
 | ||
| 							}
 | ||
| 						}
 | ||
| 					}
 | ||
| 				}
 | ||
| 				
 | ||
| 				// get URL
 | ||
| 				// handle http://fakeurl.baidu.com bullshit
 | ||
| 				$url =
 | ||
| 					$this->fuckhtml
 | ||
| 					->getTextContent(
 | ||
| 						$datafield["attributes"]["mu"]
 | ||
| 					);
 | ||
| 				
 | ||
| 				if(
 | ||
| 					preg_match(
 | ||
| 						'/^https?:\/\/(?:fakeurl|nourl)(?:\.ubs)?\.baidu\.com/',
 | ||
| 						$url
 | ||
| 					)
 | ||
| 				){
 | ||
| 					
 | ||
| 					// we got some bullshit, get jumpUrl instead
 | ||
| 					$as =
 | ||
| 						$this->fuckhtml
 | ||
| 						->getElementsByTagName(
 | ||
| 							"a"
 | ||
| 						);
 | ||
| 					
 | ||
| 					if(count($as) !== 0){
 | ||
| 						
 | ||
| 						$url =
 | ||
| 							$this->fuckhtml
 | ||
| 							->getTextContent(
 | ||
| 								$as[0]["attributes"]["href"]
 | ||
| 							);
 | ||
| 					}
 | ||
| 				}
 | ||
| 				
 | ||
| 				// get xueshu sublinks
 | ||
| 				// get list
 | ||
| 				$xueshu_list =
 | ||
| 					$this->fuckhtml
 | ||
| 					->getElementsByClassName(
 | ||
| 						"op-xueshu-links-d20-list",
 | ||
| 						$div
 | ||
| 					);
 | ||
| 				
 | ||
| 				if(count($xueshu_list) !== 0){
 | ||
| 					
 | ||
| 					$this->fuckhtml->load($xueshu_list[0]);
 | ||
| 					
 | ||
| 					$rows =
 | ||
| 						$this->fuckhtml
 | ||
| 						->getElementsByClassName(
 | ||
| 							"c-row",
 | ||
| 							"div"
 | ||
| 						);
 | ||
| 					
 | ||
| 					// remove "read more" bullshit
 | ||
| 					foreach($rows as $row){
 | ||
| 						
 | ||
| 						if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){
 | ||
| 							
 | ||
| 							$xueshu_list[0]["innerHTML"] =
 | ||
| 								str_replace(
 | ||
| 									$row["outerHTML"],
 | ||
| 									"",
 | ||
| 									$xueshu_list[0]["innerHTML"]
 | ||
| 								);
 | ||
| 						}
 | ||
| 					}
 | ||
| 					
 | ||
| 					$this->fuckhtml->load($xueshu_list[0]);
 | ||
| 					
 | ||
| 					foreach($rows as $row){
 | ||
| 						
 | ||
| 						$this->fuckhtml->load($row);
 | ||
| 						
 | ||
| 						if(strpos($row["attributes"]["class"], "op-xueshu-links-more") !== false){
 | ||
| 							
 | ||
| 							continue;
 | ||
| 						}
 | ||
| 						
 | ||
| 						$as =
 | ||
| 							$this->fuckhtml
 | ||
| 							->getElementsByTagName(
 | ||
| 								"a"
 | ||
| 							);
 | ||
| 						
 | ||
| 						foreach($as as $a){
 | ||
| 							
 | ||
| 							$sublinks[] = [
 | ||
| 								"title" =>
 | ||
| 									$this->titledots(
 | ||
| 										$this->fuckhtml
 | ||
| 										->getTextContent(
 | ||
| 											$a
 | ||
| 										)
 | ||
| 									),
 | ||
| 								"description" => null,
 | ||
| 								"date" => null,
 | ||
| 								"url" =>
 | ||
| 									$this->fuckhtml
 | ||
| 									->getTextContent(
 | ||
| 										$a["attributes"]["href"]
 | ||
| 									)
 | ||
| 							];
 | ||
| 						}
 | ||
| 					}
 | ||
| 				}
 | ||
| 				
 | ||
| 				$out["web"][] = [
 | ||
| 					"title" => $title,
 | ||
| 					"description" => $description,
 | ||
| 					"url" => $url,
 | ||
| 					"date" => null,
 | ||
| 					"type" => "web",
 | ||
| 					"thumb" => $thumb,
 | ||
| 					"sublink" => $sublinks,
 | ||
| 					"table" => []
 | ||
| 				];
 | ||
| 				continue;
 | ||
| 			}
 | ||
| 		}
 | ||
| 		
 | ||
| 		//
 | ||
| 		// Remove tracking URLs and fetch additional image resources
 | ||
| 		//
 | ||
| 		$this->resolve_urls($proxy, $out, ["web", "video"]);
 | ||
| 		$this->resolve_images($proxy, $out);
 | ||
| 		
 | ||
| 		return $out;
 | ||
| 	}
 | ||
| 	
 | ||
/**
 * Query Baidu's image search JSON endpoint and normalise the result.
 *
 * Keys read from $get: "npt" (pagination token; when truthy the stored
 * request is resumed and the filters below are not consulted), "s"
 * (search terms), "sort", "size", "ratio", "format", "color", "type".
 *
 * @param array $get Parsed request parameters.
 * @return array ["status" => "ok", "npt" => string|null, "image" => array]
 * @throws Exception When the request fails, a captcha is detected, the
 *                   body is not JSON, Baidu reports an error message or
 *                   the response carries no image list.
 */
public function image($get){
	
	// https://image.baidu.com/search/acjson?word=asmr&rn=60&pn=0&newReq=1
	//$json = file_get_contents("scraper/baidu_img.json");
	
	if($get["npt"]){
		
		// resume the stored request and move one page (60 items) forward
		[$params, $proxy] = $this->backend->get($get["npt"], "images");
		$params = json_decode($params, true);
		
		$params["pn"] = $params["pn"] + 60;
		
	}else{
		
		$proxy = $this->backend->get_ip();
		$params = [
			"word" => $get["s"],
			"rn" => 60, // results/page
			"pn" => 0, // item increment (0 * 60)
			"newReq" => 1 // otherwise the JSON layout is different
		];
		
		switch($get["sort"]){
			
			case "latest": $params["latest"] = 1; break;
			case "hot": $params["hot"] = 1; break;
		}
		
		if($get["size"] != "any"){
			
			$params["z"] = $get["size"];
		}
		
		if($get["ratio"] != "any"){
			
			$params["imgratio"] = $get["ratio"];
		}
		
		if($get["format"] != "any"){
			
			$params["imgformat"] = $get["format"];
		}
		
		if($get["color"] != "any"){
			
			$params["ic"] = $get["color"];
		}
		
		switch($get["type"]){
			
			case "hd": $params["hd"] = 1; break;
			case "isImgSet": $params["isImgSet"] = 1; break;
			case "copyright": $params["copyright"] = 1; break;
		}
	}
	
	try{
		
		$response =
			$this->get(
				$proxy,
				"https://image.baidu.com/search/acjson",
				$params,
				"https://image.baidu.com/search/index?tn=baiduimage&word=" . urlencode($get["s"])
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch JSON");
	}
	
	// keep the raw body around: it is needed if decoding fails
	$json = json_decode($response, true);
	
	if($json === null){
		
		// detect captcha first, using the raw body (the decoded value
		// is null at this point and would tell the parser nothing)
		$this->fuckhtml->load($response);
		$this->detect_ass();
		
		// fallback to json decode error
		throw new Exception("Failed to decode JSON");
	}
	
	if(
		isset($json["message"]) &&
		$json["message"] != "success"
	){
		
		throw new Exception("Baidu returned an error: {$json["message"]}");
	}
	
	if(!isset($json["data"]["images"])){
		
		throw new Exception("Baidu did not return an image object");
	}
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"image" => []
	];
	
	foreach($json["data"]["images"] as $image){
		
		if(!isset($image["objurl"], $image["thumburl"])){
			
			// entry without image URLs, nothing usable to return
			continue;
		}
		
		// thumbnail dimensions are read from the w/h query parameters;
		// the query part can be absent, in which case both fall back to 0
		$thumb_size = [];
		parse_str(
			(string)parse_url($image["thumburl"], PHP_URL_QUERY),
			$thumb_size
		);
		
		$out["image"][] = [
			"title" =>
				$this->fuckhtml
				->getTextContent(
					$image["titleShow"]
				),
			"source" => [
				[
					"url" => $image["objurl"],
					"width" => (int)$image["width"],
					"height" => (int)$image["height"]
				],
				[ // thumbnail
					"url" => $image["thumburl"],
					"width" => (int)($thumb_size["w"] ?? 0),
					"height" => (int)($thumb_size["h"] ?? 0)
				]
			],
			"url" => $image["fromUrl"]
		];
	}
	
	//
	// Detect if there's a next page
	//
	if((int)($json["data"]["totalNum"] ?? 0) >= $params["pn"] + 60){
		
		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"images",
				$proxy
			);
	}
	
	return $out;
}
 | ||
| 	
 | ||
/**
 * Scrape Baidu's video search page.
 *
 * Keys read from $get: "npt" (pagination token; when truthy the stored
 * request is resumed) and "s" (search terms).
 *
 * The response is split on "<script>" and every chunk that carries an
 * HTML comment (used as the result link) and a "video-title" anchor is
 * turned into one entry of "video".
 *
 * @param array $get Parsed request parameters.
 * @return array ["status", "npt", "video", "author", "livestream",
 *               "playlist", "reel"]; only "video" and "npt" are filled here.
 * @throws Exception When the search page cannot be fetched.
 */
public function video($get){
	
	// https://www.baidu.com/sf/vsearch?pd=video&tn=vsearch&wd=jak%2Band%2Bdaxter&async=1&pn=0
	// NOTE(review): an earlier note here said &pn grows by 20 per page,
	// but the code advances by 10 and treats 10 results as a full page;
	// confirm against the live endpoint
	
	//$html = file_get_contents("scraper/baidu_vid.html");
	
	if($get["npt"]){
		
		[$params, $proxy] = $this->backend->get($get["npt"], "videos");
		$params = json_decode($params, true);
		
		$params["pn"] = $params["pn"] + 10;
	}else{
		
		$proxy = $this->backend->get_ip();
		$params = [
			"pd" => "video",
			"tn" => "vsearch",
			"wd" => $get["s"],
			"async" => 1,
			"pn" => 0
		];
	}
	
	try{
		$html =
			$this->get(
				$proxy,
				"https://www.baidu.com/sf/vsearch",
				$params
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to get search page");
	}
	
	// line breaks are dropped before the page is cut into chunks
	$html =
		str_replace(
			["\r", "\n"],
			"",
			$html
		);
	
	$out = [
		"status" => "ok",
		"npt" => null,
		"video" => [],
		"author" => [],
		"livestream" => [],
		"playlist" => [],
		"reel" => []
	];
	
	$html = explode("<script>", $html);
	
	foreach($html as $result){
		
		$result = trim($result);
		
		$this->fuckhtml->load($result);
		
		// get URL: first HTML comment of the chunk
		preg_match(
			'/<!-- *([^ ]*) *-->/',
			$result,
			$matches
		);
		
		if(!isset($matches[1])){
			
			// no link, give up
			continue;
		}
		
		$link = $matches[1];
		
		// get title
		$title =
			$this->fuckhtml
			->getElementsByClassName(
				"video-title",
				"a"
			);
		
		if(count($title) === 0){
			
			// should not happen
			continue;
		}
		
		$title =
			$this->fuckhtml
			->getTextContent(
				$title[0]
			);
		
		// get thumbnail
		$img =
			$this->fuckhtml
			->getElementsByClassName(
				"border-radius",
				"img"
			);
		
		if(count($img) !== 0){
			
			$thumb = [
				"url" =>
					$this->unfuckthumb(
						$this->fuckhtml
						->getTextContent(
							$img[0]["attributes"]["src"]
						)
					),
				"ratio" => "16:9"
			];
		}else{
			
			$thumb = [
				"url" => null,
				"ratio" => null
			];
		}
		
		// every <span> of the chunk; reused for the class lookups below
		$span =
			$this->fuckhtml
			->getElementsByTagName(
				"span"
			);
		
		// get duration
		$duration =
			$this->fuckhtml
			->getElementsByClassName(
				"video_play_timer",
				$span
			);
		
		if(count($duration) !== 0){
			
			$duration =
				$this->hms2int(
					$this->fuckhtml
					->getTextContent(
						$duration[0]
					)
				);
		}else{
			
			$duration = null;
		}
		
		// get author
		// 来源:哔哩哔哩
		$author =
			$this->fuckhtml
			->getElementsByClassName(
				"wetSource",
				$span
			);
		
		if(count($author) !== 0){
			
			$author_parts =
				explode(
					":",
					$this->fuckhtml
					->getTextContent(
						$author[0]
					),
					2
				);
			
			// label without a colon: there is no value part to read
			$author = $author_parts[1] ?? null;
		}else{
			
			$author = null;
		}
		
		// get date posted
		//发布时间:2024-05-06
		
		// AND get description
		// 简介:Our first look
		$infospans =
			array_merge(
				$this->fuckhtml
				->getElementsByClassName(
					"c-font-normal",
					$span
				),
				$this->fuckhtml
				->getElementsByClassName(
					"c-font-normal",
					"div"
				)
			);
		
		$date = null;
		$description = null;
		
		foreach($infospans as $infospan){
			
			// "label:value"
			$infospan =
				explode(
					":",
					$this->fuckhtml
					->getTextContent(
						$infospan
					),
					2
				);
			
			if(count($infospan) !== 2){
				
				// should not happen
				continue;
			}
			
			$infospan[1] =
				$this->fuckhtml
				->getTextContent(
					$infospan[1]
				);
			
			switch($infospan[0]){
				
				case "发布时间": // date posted
					$date = $this->parse_time($infospan[1]);
					break;
				
				case "简介": // description
					$description = $infospan[1];
					break;
			}
		}
		
		$out["video"][] = [
			"title" => $this->titledots($title),
			// keep null when the chunk had no description instead of
			// feeding null to the string trimmer
			"description" =>
				$description === null ?
				null : $this->titledots($description),
			"author" => [
				"name" => $author,
				"url" => null,
				"avatar" => null
			],
			"date" => $date,
			"duration" => $duration,
			"views" => null,
			"thumb" => $thumb,
			"url" => $link
		];
	}
	
	if(count($out["video"]) === 10){
		
		// assume there's another page after this
		$out["npt"] =
			$this->backend->store(
				json_encode($params),
				"videos",
				$proxy
			);
	}
	
	return $out;
}
 | ||
| 	
 | ||
/**
 * Fetch a Baidu news result page and reshape it into the "news" format.
 *
 * Keys read from $get: "npt" (pagination token) and "s" (search terms).
 * The HTML is handed to parse_search(); its "web" entries become the
 * returned articles and its "npt" value is passed through.
 *
 * @param array $get Parsed request parameters.
 * @return array ["status" => "ok", "npt" => mixed, "news" => array]
 * @throws Exception When the search page cannot be fetched.
 */
public function news($get){
	
	//$proxy = $this->backend->get_ip();
	//$html = file_get_contents("scraper/baidu.html");
	//$npt_data = [];
	
	$resuming = (bool)$get["npt"];
	
	if($resuming){
		
		// restore cookie + request from the token, then step 20 results on
		[$stored, $proxy] = $this->backend->get($get["npt"], "news");
		$stored = json_decode($stored, true);
		
		$this->cookie = $stored["cookie"];
		
		$request = $stored["req"];
		$request["pn"] = $request["pn"] + 20;
	}else{
		
		//
		// Get authentication token
		//
		$proxy = $this->backend->get_ip();
		
		// @TODO add filters
		$request = [
			"wd" => $get["s"],
			"rn" => 20,
			"tn" => "news"
		];
	}
	
	try{
		
		$html =
			$this->get(
				$proxy,
				"https://www.baidu.com/s",
				$request
			);
	}catch(Exception $error){
		
		throw new Exception("Failed to fetch search page");
	}
	
	if(!$resuming){
		
		// the first request is sent without "pn"; the offset is only
		// recorded afterwards for the parser
		$request["pn"] = 0;
	}
	
	$parsed = $this->parse_search($proxy, "news", $request, $html);
	
	$articles = [];
	
	foreach($parsed["web"] as $entry){
		
		$thumb_url = $entry["thumb"]["url"];
		
		// a ratio is only reported when a thumbnail exists
		if($thumb_url !== null){
			
			$thumb_ratio = "16:9";
		}else{
			
			$thumb_ratio = null;
		}
		
		$articles[] = [
			"title" => $entry["title"],
			"author" => null,
			"description" => $entry["description"],
			"date" => $entry["date"],
			"thumb" => [
				"url" => $thumb_url,
				"ratio" => $thumb_ratio
			],
			"url" => $entry["url"]
		];
	}
	
	return [
		"status" => "ok",
		"npt" => $parsed["npt"],
		"news" => $articles
	];
}
 | ||
| 	
 | ||
/**
 * Clean up a Baidu thumbnail URL.
 *
 * - gimg*.baidu.com proxy URLs are replaced by the decoded value of
 *   their "src=" parameter.
 * - For any other URL containing "&", everything after the first "&" is
 *   treated as a parameter list: "size" is dropped and an existing "q"
 *   is set to "90". The part before the first "&" is left untouched.
 * - URLs that fit neither shape are returned unchanged.
 *
 * @param string $url Thumbnail URL.
 * @return string Cleaned URL.
 */
private function unfuckthumb($url){
	
	// probe for proxy URL
	if(
		preg_match(
			'/^https?:\/\/gimg(?:[0-9]+)?\.baidu\.com/',
			$url
		)
	){
		
		$parts = explode("src=", $url);
		if(count($parts) !== 2){
			
			// zero or several "src=" markers, cannot tell which to use
			return $url;
		}
		
		return urldecode(explode("&", $parts[1])[0]);
	}
	
	$q = explode("&", $url, 2);
	
	if(count($q) !== 2){
		
		// no parameter list to rewrite
		return $url;
	}
	
	// the parameter list does not follow the URL spec and may contain a
	// stray "?" where "&" belongs:
	// &fmt=auto?s=BB32F3A050471AEC72886934030090C4&sec=1753203600&t=0fb2194775d3bd3d1bb114b818479e0a
	parse_str(str_replace("?", "&", $q[1]), $query);
	
	unset($query["size"]);
	if(isset($query["q"])){ $query["q"] = "90"; }
	
	// glue head and rebuilt tail together; a search-and-replace of the
	// old tail would also rewrite an identical substring inside the head
	return $q[0] . "&" . http_build_query($query);
}
 | ||
| 	
 | ||
/**
 * Strip leading/trailing spaces, dots, control whitespace, NUL bytes and
 * the horizontal ellipsis (U+2026) from a title or description.
 *
 * The stripping is done per character, not per byte: a byte-wise trim
 * with "…" in its list would also remove the single bytes E2, 80 and A6
 * and thereby cut other multibyte characters in half (for example a
 * text ending in "一", which is encoded as E4 B8 80).
 *
 * @param string|null $title Text to clean; null is treated as "".
 * @return string Cleaned text.
 */
private function titledots($title){
	
	$title = (string)$title;
	
	// space . tab LF CR NUL VT and U+2026
	$strip = '[ .\t\n\r\x00\x0B\x{2026}]+';
	
	$clean =
		preg_replace(
			'/^' . $strip . '|' . $strip . '\z/u',
			"",
			$title
		);
	
	if($clean === null){
		
		// input is not valid UTF-8: only strip the single-byte characters
		return trim($title, " .\t\n\r\0\x0B");
	}
	
	return $clean;
}
 | ||
| 	
 | ||
/**
 * Convert a "H:M:S", "M:S" or "S" string into a number of seconds.
 *
 * The string is split on ":" into at most three fields; each field is
 * cast to int and weighted by its position counted from the right.
 *
 * @param string $time Duration text.
 * @return int Duration in seconds.
 */
private function hms2int($time){
	
	// weights for seconds, minutes, hours (right-most field first)
	$weights = [1, 60, 3600];
	
	$fields = array_reverse(explode(":", $time, 3));
	
	$seconds = 0;
	
	foreach($fields as $position => $field){
		
		$seconds += (int)$field * $weights[$position];
	}
	
	return $seconds;
}
 | ||
| 	
 | ||
/**
 * Parse a Chinese view counter such as "123次播放" or "1.5万次播放".
 *
 * Recognised forms, all followed by 次 ("times"):
 *   <number>万  => number * 10,000
 *   <number>亿  => number * 100,000,000
 *   <integer>   => taken as-is
 * The number in front of 万/亿 may carry a decimal part.
 *
 * @param string|null $views Counter text.
 * @return int|null View count, or null when no counter was recognised.
 */
private function parse_viewcount($views){
	
	$views = (string)$views;
	
	if(
		// scaled by 万 (10k) or 亿 (100M), decimals allowed
		preg_match(
			'/([0-9]+(?:\.[0-9]+)?)(万|亿)次/',
			$views,
			$matches
		)
	){
		
		$scale = $matches[2] === "亿" ? 100000000 : 10000;
		
		// round() absorbs float noise such as 1.2 * 10000
		return (int)round((float)$matches[1] * $scale);
	}
	
	if(
		// units
		preg_match(
			'/([0-9]+)次/',
			$views,
			$matches
		)
	){
		
		return (int)$matches[1];
	}
	
	return null;
}
 | ||
| 	
 | ||
/**
 * Turn a date string into a Unix timestamp.
 *
 * Recognised Chinese forms are rewritten to something strtotime()
 * understands; anything else is passed to strtotime() unchanged.
 *
 *   2023年8月7日   => 2023/8/7
 *   昨天11:45     => Yesterday 11:45
 *   5分钟前        => 5 minutes ago
 *   3小时前        => 3 hours ago
 *   3天前          => 3 days ago
 *   1个月前        => 1 months ago
 *
 * @param string|null $time Date text.
 * @return int|null Timestamp, or null whenever strtotime() rejects the
 *                  text (in every branch, never false).
 */
private function parse_time($time){
	
	$time = (string)$time;
	
	// "<n><suffix>" => "<n> <unit> ago"
	$relative = [
		"分钟前" => "minutes",
		"小时前" => "hours",
		"天前" => "days",
		"个月前" => "months"
	];
	
	// text handed to strtotime(); the raw input unless a form matches
	$candidate = $time;
	
	if(
		// 2023年8月7日 => yyyy/m/d
		preg_match(
			'/([0-9]{4})年([0-9]{1,2})月([0-9]{1,2})日/',
			$time,
			$matches
		)
	){
		
		$candidate = "{$matches[1]}/{$matches[2]}/{$matches[3]}";
		
	}elseif(
		// 昨天11:45 => yesterday at 11:45
		// 昨天 => yesterday
		preg_match(
			'/昨天(.*)/',
			$time,
			$matches
		)
	){
		
		$candidate = "Yesterday {$matches[1]}";
		
	}else{
		
		foreach($relative as $suffix => $unit){
			
			if(
				preg_match(
					'/([0-9]{1,4})' . $suffix . '/',
					$time,
					$matches
				)
			){
				
				$candidate = "{$matches[1]} {$unit} ago";
				break;
			}
		}
	}
	
	$timestamp = strtotime($candidate);
	
	if($timestamp === false){
		
		// unparsable in any branch: report "no date"
		return null;
	}
	
	return $timestamp;
}
 | ||
| 	
 | ||
/**
 * Inspect the document currently loaded into the HTML parser and raise
 * if it looks like a captcha wall.
 *
 * A document is reported as a captcha when it contains no <a> element
 * at all, or when the href of its first <a> points at
 * wappass.baidu.com/static/captcha.
 *
 * @return void
 * @throws Exception "Baidu returned a Captcha"
 */
private function detect_ass(){
	
	$as =
		$this->fuckhtml
		->getElementsByTagName(
			"a"
		);
	
	if(count($as) === 0){
		
		// no link on the whole page
		throw new Exception("Baidu returned a Captcha");
	}
	
	if(!isset($as[0]["attributes"]["href"])){
		
		// first anchor has no target, nothing to compare against
		return;
	}
	
	if(
		preg_match(
			'/^https?:\/\/wappass\.baidu\.com\/static\/captcha/',
			$this->fuckhtml
			->getTextContent(
				$as[0]["attributes"]["href"]
			)
		)
	){
		
		throw new Exception("Baidu returned a Captcha");
	}
}
 | ||
| }
 | 
