最簡單的方法就是用 fopen()、file_get_contents() 等函式,做法有很多種;不過這些方法都會把整頁 HTML 讀取回來,若只是要判斷網址是否失效,速度就顯得有些緩慢!其實可以改由 HTTP header 來判斷,就不用把整頁的內容都抓回來!
可以用 get_headers() 得到這些資訊:
HTTP/1.1 200 OK
Date: Mon, 06 Oct 2008 15:45:27 GMT
Server: Apache/2.2.9
X-Powered-By: PHP/5.2.6-4
Set-Cookie: PHPSESSID=4e037868a4619d6b4d8c52d0d5c59035; path=/
Expires: Thu, 19 Nov 1981 08:52:00 GMT
Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0
Pragma: no-cache
Vary: Accept-Encoding
Connection: close
Content-Type: text/html
PHP + Curl + Content-Type 的判斷方式:
/**
 * Check whether a URL points to a reachable page with an acceptable content type.
 *
 * Sends a header-only (HEAD-style) request with cURL, follows redirects, and
 * inspects the status code and Content-Type of the FINAL response.
 *
 * @param string $url The URL to probe.
 * @return bool True when the final response has a status code in [200, 400)
 *              and its media type is text/*, image/* or one of the whitelisted
 *              XML application types; false otherwise (including malformed
 *              URLs, URLs carrying a user part, and transfer failures).
 */
function existsWebpage($url){
    $parts = parse_url($url);
    if (!$parts) { return false; } /* the URL was seriously wrong */
    if (isset($parts['user'])) { return false; } /* user@gmail.com */

    $ch = curl_init();
    if ($ch === false) { return false; }

    curl_setopt($ch, CURLOPT_URL, $url);
    /* set the user agent - might help, doesn't hurt */
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; wowTreebot/1.0; +http://wowtree.com)');
    /* keep curl from echoing anything to the output */
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    /* try to follow redirects */
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    /* give up after 15 seconds connecting / 20 seconds overall */
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 15);
    curl_setopt($ch, CURLOPT_TIMEOUT, 20);
    /* don't download the page, just the header (much faster in this case) */
    curl_setopt($ch, CURLOPT_NOBODY, true);

    /* handle HTTPS links; the scheme may be absent and is case-insensitive */
    if (isset($parts['scheme']) && strcasecmp($parts['scheme'], 'https') === 0) {
        /* the value 1 is no longer accepted by libcurl; 2 is the supported setting */
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        /* NOTE(review): peer verification is deliberately left off, as in the
           original, so links to sites with untrusted certificates still count
           as existing. Do not reuse this handle setup for fetching content. */
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    }

    $response = curl_exec($ch);
    /* Ask curl for the metadata of the LAST response. Parsing the raw header
       text would pick up the first hop of a redirect chain and would miss
       "HTTP/2" status lines. */
    $code = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $rawType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    curl_close($ch);

    if ($response === false) { return false; } /* transfer failed or timed out */

    /* strip parameters such as "; charset=UTF-8" and normalise the case */
    $pieces = explode(';', (string) $rawType, 2);
    $mime = strtolower(trim($pieces[0]));
    if ($mime === '') { return false; }

    /* allow content-type list */
    $allowedTypes = array(
        'application/atom+xml',
        'application/rdf+xml',
        'application/xhtml+xml',
        'application/xml',
        'application/xml-dtd',
        'application/xml-external-parsed-entity',
    );
    $typeOk = in_array($mime, $allowedTypes, true)
        || strncmp($mime, 'text/', 5) === 0
        || strncmp($mime, 'image/', 6) === 0;
    if (!$typeOk) { return false; }

    /* see if code indicates success */
    return (($code >= 200) && ($code < 400));
}
- Keywords : curl_setopt, matches, false, application, return, content_type, preg_match, parts, response, Content, seconds, check, cache, 速度就顯得, 資訊, 讀取回來, 要判斷網址, 若只, 網址, 會把整頁