Clean up your curl – fixing relative links in scraped content

A few years ago I added a jQuery browser check for ODT and Skillport eLearning. This week I updated that functionality and noticed a relative link in some remote content I was scraping via cURL.
The problem was that the relative link needed to be fully qualified.

Here’s the old table that I cURL from the Skillport support site:
Screen Shot of the old compatibility table

Here’s the updated table:
a view of the table pulled in from skillport
The culprit is the “Click here to view Skillsoft’s Browser Support Statement” at the top of the table. “here” links to /44007.htm which isn’t helpful.

And here’s the key to fixing that:

<?php
  //Fix any base url issues: prefix $baseUrl to every href/src value that is
  //not already absolute. Values that start with a URL scheme (http:, https:,
  //mailto:, ...), with "//" (protocol-relative) or with "#" (in-page anchor)
  //are left untouched. A single leading "/" is dropped so the result does not
  //contain a double slash; this relies on $baseUrl ending in "/".
  //"~" is the delimiter because "#" and "/" both occur inside the pattern.
  $body = preg_replace('~( href| src)=(["\'])(?![a-z][a-z0-9+.-]*:|//|#)/?~i','${1}=${2}'.$baseUrl,$body);
[/code]
This prepends the remote base URL to any links in the remote content that don't start with http or mailto.

Ah, all's well that ends well.

Here's the full script:
<div class="tallcode"><?php
// Fetches a page from the Skillsoft documentation site with cURL, cuts out the
// table rows between the "Supported Browsers" and "Recommended Browser Settings"
// header cells, qualifies relative links in that fragment and prints it inside
// a table. Nothing is printed when the page cannot be fetched or the markers
// are not found.
global $pageToCURL;

// Base URL of the remote site; also prefixed to relative links further down.
// It must end in "/" for the link rewriting to produce valid URLs.
$baseUrl = "http://documentation.skillsoft.com/";

// Fall back to the default page when no page was chosen before this script ran.
if(!$pageToCURL){
  $pageToCURL=$baseUrl . '21054.htm';
}

// Fetch the remote page and return it as a string instead of printing it.
$ch = curl_init();
// NOTE(review): the "http://" prefix is stripped before the URL is handed to
// cURL; the reason is not visible here. Kept as-is; confirm before removing.
curl_setopt($ch, CURLOPT_URL, str_replace("http://","",$pageToCURL));
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$output = curl_exec($ch);
curl_close($ch);

// curl_exec() returns false on failure; treat that as an empty page so the
// marker search below simply finds nothing.
if($output === false){
  $output = "";
}

// Only the length of this string is used: the extraction window is moved back
// by this many bytes from each marker cell.
// NOTE(review): presumably so the row tag in front of the marker cell ends up
// inside the extracted fragment; verify against the remote markup.
$customContent=' <tr align="left" valign="top"> ';
$markerPadding=strlen($customContent);

// Start marker: either a string to start after ...
$startAfter='';
// ... or, when $startAfter is empty, a string to start before.
$startBefore='<td bgcolor="#C0C0C0" style="border:1px solid #010101;" width="336"><p class="tablebodytext"><strong class="specialbold">Supported Browsers</strong></p>';

// End marker: extraction stops before this cell.
$endBefore='<td bgcolor="#C0C0C0" style="border:1px solid #010101;" width="336"><p class="tablebodytext"><strong class="specialbold">Recommended Browser Settings </strong></p>';

$body="";
if($startAfter!=""){
  $startMarkerPos = strpos($output,$startAfter);
  // NOTE(review): the padding is subtracted in this branch too, exactly as in
  // the original arithmetic; confirm that is intended for a start-after marker.
  $startShift = strlen($startAfter)-$markerPadding;
} else {
  $startMarkerPos = strpos($output,$startBefore);
  $startShift = -$markerPadding;
}
$endMarkerPos = strpos($output,$endBefore);

// strpos() returns false when a marker is missing. Doing arithmetic on that
// would yield a negative offset and make substr() return unrelated content
// from the end of the page, so extract only when both markers were found.
if($startMarkerPos !== false && $endMarkerPos !== false){
  // Clamp to 0 so a marker near the top of the page cannot produce a negative
  // (counted-from-the-end) offset.
  $bodyOpenPos = max(0, $startMarkerPos + $startShift);
  $bodyClosePos = max(0, $endMarkerPos - $markerPadding);
  $bodyLen = $bodyClosePos - $bodyOpenPos;
  // A non-positive length means the markers are in the wrong order.
  if($bodyLen > 0){
    $body = substr($output,$bodyOpenPos,$bodyLen);
  }
}

if(strlen($body) > 0){
  //Fix any base url issues: prefix $baseUrl to every href/src value that is
  //not already absolute. Values that start with a URL scheme (http:, https:,
  //mailto:, ...), with "//" (protocol-relative) or with "#" (in-page anchor)
  //are left untouched. A single leading "/" is dropped so the result does not
  //contain a double slash.
  //"~" is the delimiter because "#" and "/" both occur inside the pattern.
  $body = preg_replace('~( href| src)=(["\'])(?![a-z][a-z0-9+.-]*:|//|#)/?~i','${1}=${2}'.$baseUrl,$body);
  // $body is remote HTML and is printed unescaped on purpose; this trusts the
  // remote site. The URL in the title attribute is plain text, so it is escaped.
?>
From the <span title="<?php echo htmlspecialchars($pageToCURL, ENT_QUOTES, 'UTF-8');?>">Skillport website</span>:
<table border="0" cellpadding="0" cellspacing="0" class="table">
  <?php echo $body;?>
</table>
<?php
} //end if ?>