PHP class function for screen scraping
10
Jan
I’ve updated my simple PHP function (the one to replace fopen()) for grabbing URLs using cURL. I’ve added some features and made it a class instead of a straight PHP function. One improvement is the ability to normalize URLs so you can use relative URLs. It also has more error checking and uses a standard user-agent by default.
The syntax is a little different from the previous version. To use it, you create an instance of the class, then call the appropriate method:
-
// Build a grabber, resolve the address to an absolute URL, then download the page.
$urlScoop = new UrlGrabber();
$absoluteUrl = $urlScoop->_normalize("https://www.simmonsconsulting.com/");
$rawhtml = $urlScoop->_get($absoluteUrl);
Fetching a relative URL would look like this:
-
// A relative address is first resolved against the current request, then fetched.
$urlScoop = new UrlGrabber();
$absoluteUrl = $urlScoop->_normalize("../../Photos/");
$rawhtml = $urlScoop->_get($absoluteUrl);
The full class is included after the jump.
-
/**
 * Small cURL wrapper for fetching pages over HTTP(S).
 *
 * After each _get()/_post() call the response body (plus an appended note if
 * the request looked abnormal) is kept in $content, and the curl_getinfo()
 * result for that request is kept in $info.
 */
class UrlGrabber {

    /**
     * Body of the last response, with an error note appended when the
     * transfer failed or the HTTP status was not 200.
     * Declared explicitly: dynamic properties are deprecated as of PHP 8.2.
     *
     * @var string
     */
    public $content = "";

    /**
     * curl_getinfo() result for the last request ("" until a request has run).
     *
     * @var array|string
     */
    public $info = "";

    /**
     * User-agent string sent with every request.
     *
     * @var string
     */
    private $userAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)";

    /**
     * Fetch a URL with an HTTP GET request.
     *
     * @param string $url Absolute URL to fetch (see _normalize()).
     * @return string Response body; a diagnostic note is appended when the
     *                transfer failed or the status code was not 200.
     */
    public function _get($url) {
        return $this->_request($url, array());
    }

    /**
     * Send an HTTP POST request and return the response.
     *
     * Unlike _get(), this follows redirects (CURLOPT_FOLLOWLOCATION).
     *
     * @param string       $url  Absolute URL to post to.
     * @param array|string $vars Value handed to CURLOPT_POSTFIELDS.
     * @return string Response body; a diagnostic note is appended when the
     *                transfer failed or the status code was not 200.
     */
    public function _post($url, $vars) {
        return $this->_request($url, array(
            CURLOPT_HEADER         => 0,
            CURLOPT_POST           => 1,
            CURLOPT_POSTFIELDS     => $vars,
            CURLOPT_FOLLOWLOCATION => 1,
        ));
    }

    /**
     * Turn an absolute, host-relative ("/path") or relative URL into an
     * absolute one, using the current request's $_SERVER values.
     *
     * @param string $url URL to normalize.
     * @return string Absolute URL. URLs that already start with http:// or
     *                https:// (any letter case) are returned unchanged.
     */
    public function _normalize($url) {
        // Already absolute? URI schemes are case-insensitive, hence the "i" flag.
        if (preg_match("/^https?:\/\//i", $url)) {
            return $url;
        }

        $url_root = $this->_urlRoot();

        // Host-relative: only scheme and host need to be prepended.
        if (preg_match("/^\//", $url)) {
            return $url_root . $url;
        }

        // Relative: resolve against the directory of the running script.
        $self = isset($_SERVER['PHP_SELF']) ? str_replace('\\', '/', $_SERVER['PHP_SELF']) : '';
        $phys_path = getcwd();
        $phys_path = ($phys_path === false) ? '' : str_replace('\\', '/', $phys_path);

        $self_segments = explode('/', $self);
        array_pop($self_segments); // drop the script file name
        $cwd_segments = explode('/', $phys_path);

        // Keep only the PHP_SELF directory segments that also occur in the
        // working directory. Empty segments (from the leading "/") are
        // skipped so the result never contains a double slash.
        $dir_segments = array();
        foreach (array_intersect($self_segments, $cwd_segments) as $segment) {
            if ($segment !== '') {
                $dir_segments[] = $segment;
            }
        }
        $dir_path = count($dir_segments) > 0 ? implode('/', $dir_segments) . '/' : '';

        return $url_root . '/' . $dir_path . $url;
    }

    /**
     * Build "scheme://host" for the current request.
     *
     * Uses HTTP_HOST when it is set and non-empty, otherwise SERVER_NAME.
     *
     * @return string
     */
    private function _urlRoot() {
        // HTTPS is absent from $_SERVER on plain-HTTP requests, so guard the read.
        $scheme = (isset($_SERVER['HTTPS']) && $_SERVER['HTTPS'] == "on") ? "https://" : "http://";

        if (isset($_SERVER['HTTP_HOST']) && strlen($_SERVER['HTTP_HOST']) > 0) {
            $host = $_SERVER['HTTP_HOST'];
        } elseif (isset($_SERVER['SERVER_NAME'])) {
            $host = $_SERVER['SERVER_NAME'];
        } else {
            $host = "";
        }

        return $scheme . $host;
    }

    /**
     * Run one cURL transfer with the shared defaults plus $options and store
     * the outcome in $content / $info.
     *
     * @param string $url     URL to request.
     * @param array  $options Extra CURLOPT_* => value pairs, applied in order
     *                        after the defaults.
     * @return string Same value as $this->content.
     */
    private function _request($url, $options) {
        $this->content = "";
        $this->info = "";

        $ch = curl_init();
        if ($ch === false) {
            $this->content = "HTTP status was abnormal for " . $url . " [0]\nUnable to initialise cURL";
            return $this->content;
        }

        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, $this->userAgent);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);
        foreach ($options as $option => $value) {
            curl_setopt($ch, $option, $value);
        }

        $body = curl_exec($ch);
        $this->info = curl_getinfo($ch);
        $status = isset($this->info['http_code']) ? $this->info['http_code'] : 0;

        // Check for success; if anything weird happened append a note.
        if ($body === false || $status != 200) {
            // Keep whatever body was received and separate the note from it.
            $body = ($body === false) ? "" : $body . "\n\n";
            $body .= "HTTP status was abnormal for " . $url . " [" . $status . "]";
            $error = curl_error($ch);
            if ($error) {
                $body .= "\n" . $error;
            }
        }

        curl_close($ch);

        $this->content = $body;
        return $this->content;
    }
}