PHP class function for screen scraping
I’ve updated my simple PHP function (the one to replace fopen()) for grabbing URLs using cURL. I’ve added some features and made it a class instead of a straight PHP function. One improvement is the ability to normalize URLs so you can use relative URLs. It also has more error checking and uses a standard user-agent by default.
The syntax is a little different from the previous version. To use it, you create an instance of the object then call the proper method:
$urlScoop = new UrlGrabber;
Fetching a relative url would look like this:
$urlScoop = new UrlGrabber;
The class is included after the jump.
class UrlGrabber {
/**
 * Fetch a URL with an HTTP GET request via cURL.
 *
 * The response body is stored in $this->content and the curl_getinfo()
 * array in $this->info. When the transfer fails or the HTTP status is not
 * 200, a human-readable note (and the cURL error text, if any) is appended
 * to the content instead of raising an error. CURLOPT_FOLLOWLOCATION is
 * not set here, so a redirect response is reported as an abnormal status.
 *
 * @param string $url Fully qualified URL to fetch.
 * @return string The response body, possibly followed by the diagnostic note
 *                (the same value that is stored in $this->content).
 */
function _get($url) {
	$ch = curl_init();
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
	curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)");
	curl_setopt($ch, CURLOPT_URL, $url);
	curl_setopt($ch, CURLOPT_TIMEOUT, 60);

	$this->content = curl_exec($ch);
	$this->info = curl_getinfo($ch);

	// Check for success; if anything weird happened, append a note.
	if ($this->content === false || $this->info['http_code'] != 200) {
		// curl_exec() returns false on failure: start from an empty body
		// explicitly instead of relying on false-to-string conversion.
		$body = ($this->content === false) ? '' : $this->content . "\n\n";
		$body .= "HTTP status was abnormal for $url [". $this->info['http_code']. "]";

		$error = curl_error($ch);
		if ($error) {
			$body .= "\n". $error;
		}
		$this->content = $body;
	}

	curl_close($ch);

	return $this->content;
}
/**
 * Send an HTTP POST request via cURL, following redirects.
 *
 * The response body is stored in $this->content and the curl_getinfo()
 * array in $this->info. When the transfer fails or the final HTTP status
 * is not 200, a human-readable note (and the cURL error text, if any) is
 * appended to the content instead of raising an error.
 *
 * @param string $url  Fully qualified URL to post to.
 * @param mixed  $vars POST payload, handed unchanged to CURLOPT_POSTFIELDS.
 * @return string The response body, possibly followed by the diagnostic note
 *                (the same value that is stored in $this->content).
 */
function _post($url,$vars) {
	$ch = curl_init();
	curl_setopt($ch, CURLOPT_URL, $url);
	curl_setopt($ch, CURLOPT_HEADER, 0);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
	curl_setopt($ch, CURLOPT_TIMEOUT, 60);
	curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)");
	curl_setopt($ch, CURLOPT_POST, 1);
	curl_setopt($ch, CURLOPT_POSTFIELDS, $vars);
	curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

	$this->content = curl_exec($ch);
	$this->info = curl_getinfo($ch);

	// Check for success; if anything weird happened, append a note.
	if ($this->content === false || $this->info['http_code'] != 200) {
		// curl_exec() returns false on failure: start from an empty body
		// explicitly instead of relying on false-to-string conversion.
		$body = ($this->content === false) ? '' : $this->content . "\n\n";
		$body .= "HTTP status was abnormal for $url [". $this->info['http_code']. "]";

		$error = curl_error($ch);
		if ($error) {
			$body .= "\n". $error;
		}
		$this->content = $body;
	}

	curl_close($ch);

	return $this->content;
}
/**
 * Turn an absolute, root-relative or relative URL into a fully qualified one.
 *
 * - "http://..." / "https://..." URLs are returned unchanged.
 * - URLs starting with "/" are prefixed with the current request's scheme
 *   and host.
 * - Anything else is treated as relative to the directory of the running
 *   script; that directory is derived from the path segments that
 *   $_SERVER['PHP_SELF'] and getcwd() have in common.
 *
 * @param string $url URL to normalize.
 * @return string Fully qualified URL.
 */
function _normalize($url) {
	// Is the URL already absolute? Then there is nothing to do.
	if (preg_match("/^https?:\/\//", $url)) {
		return $url;
	}

	// Scheme of the current request. $_SERVER['HTTPS'] is absent on plain
	// HTTP and may be "off" under IIS, so guard both cases.
	$is_https = !empty($_SERVER['HTTPS']) && strtolower($_SERVER['HTTPS']) != 'off';
	$url_root = $is_https ? "https://" : "http://";

	// Host of the current request, falling back to SERVER_NAME when the
	// client sent no Host header. Used by both branches below.
	if (isset($_SERVER['HTTP_HOST']) && strlen($_SERVER['HTTP_HOST']) > 0) {
		$url_root .= $_SERVER['HTTP_HOST'];
	} elseif (isset($_SERVER['SERVER_NAME'])) {
		$url_root .= $_SERVER['SERVER_NAME'];
	}

	// Is the URL locally absolute (root-relative)?
	if (preg_match("/^\//", $url)) {
		return $url_root . $url;
	}

	// The URL must be relative: resolve it against the script's directory.
	$slash = '/';
	$self = isset($_SERVER['PHP_SELF']) ? $_SERVER['PHP_SELF'] : '';
	$self = str_replace('\\', $slash, $self);
	$phys_path = str_replace('\\', $slash, (string) getcwd());

	$file_path_array = explode($slash, $self);
	array_pop($file_path_array); // drop the script file name
	$cwd_path_array = explode($slash, $phys_path);

	// Keep the URL path segments that also occur in the working directory.
	// Empty segments (from the leading "/") are removed so the result never
	// contains a doubled slash after the host.
	$self_dir_path_array = array_values(
		array_filter(array_intersect($file_path_array, $cwd_path_array), 'strlen')
	);

	$cwd_url = $url_root . '/';
	if (count($self_dir_path_array) > 0) {
		$cwd_url .= implode('/', $self_dir_path_array) . '/';
	}

	return $cwd_url . $url;
}