View
1.530
Download
0
Category
Tags:
Preview:
DESCRIPTION
Presentation held at the London XQuery Meetup in September 2011. It shows how web scraping has naturally evolved towards XQuery and discusses the obstacles encountered when scraping websites. A live example demonstrates how XQuery solves these problems.
Citation preview
XQuery: Querying the World (formerly known as Web Scraping)
Dennis Knochenwefel <dennis.knochenwefel@28msec.com>
Evolution: Web Scraping
// Screen-scrapes a team roster: downloads the page at $url, entity-decodes it, and
// removes tabs, line breaks, double spaces, NUL and vertical-tab characters.
// $start is the offset of the '<table cellpadding="2" class="standard_table"' opening;
// $end is the offset just past the first following '</table>' (+ 8 = length of that tag).
// NOTE(review): the results of file_get_contents() and strpos() are used without any
// failure check — confirm what should happen when the page or the table is missing.
$url = "http://www.nfl.com/teams/sandiegochargers/roster?team=SD";$raw = file_get_contents($url);$newlines = array("\t","\n","\r","\x20\x20","\0","\x0B");$content = str_replace($newlines, "", html_entity_decode($raw));$start = strpos($content,'<table cellpadding="2" class="standard_table"');$end = strpos($content,'</table>',$start) + 8;
// Cut the table markup out of the cleaned page.
$table = substr($content,$start,$end-$start);
// Collect every <tr ...</tr> span (U = ungreedy) and walk the full matches in $rows[0].
preg_match_all("|<tr(.*)</tr>|U",$table,$rows);foreach ($rows[0] as $row){
// Rows containing '<th' are skipped; for the others the first three <td> cells are
// stripped of tags and printed as "position - name - Number number".
if ((strpos($row,'<th')===false)){ preg_match_all("|<td(.*)</td>|U",$row,$cells); $number = strip_tags($cells[0][0]); $name = strip_tags($cells[0][1]); $position = strip_tags($cells[0][2]); echo "{$position} - {$name} - Number {$number} <br>\n"; }}
source: http://www.bradino.com/php/screen-scraping/
PHP (2007)
// Screen-scrapes a team roster table from a fixed URL using string search and regexes.
$url = "http://www.nfl.com/teams/sandiegochargers/roster?team=SD";
// Download the raw HTML of the page.
// NOTE(review): the return value is not checked for failure before use — confirm intent.
$raw = file_get_contents($url);
// Characters to delete: tab, newline, carriage return, two consecutive spaces, NUL, vertical tab.
$newlines = array("\t","\n","\r","\x20\x20","\0","\x0B");
// Decode HTML entities first, then remove the characters listed above.
$content = str_replace($newlines, "", html_entity_decode($raw));
// Offset of the opening of the roster table, located by its exact attribute text.
$start = strpos($content,'<table cellpadding="2" class="standard_table"');
// Offset just past the first '</table>' after $start (8 is the length of '</table>').
// NOTE(review): neither strpos() result is tested against false — verify behaviour
// when the markup is not found.
$end = strpos($content,'</table>',$start) + 8;
// Extract the table markup, opening tag through closing tag.
$table = substr($content,$start,$end-$start);
// Match each <tr ...</tr> span ungreedily (U modifier); $rows[0] holds the full matches.
preg_match_all("|<tr(.*)</tr>|U",$table,$rows);foreach ($rows[0] as $row){
// Skip any row that contains a '<th' (header) cell.
if ((strpos($row,'<th')===false)){
// Match each <td ...</td> span in the row; $cells[0] holds the full matches.
preg_match_all("|<td(.*)</td>|U",$row,$cells);
// The first three cells are taken as number, name and position, with markup removed.
$number = strip_tags($cells[0][0]);
$name = strip_tags($cells[0][1]);
$position = strip_tags($cells[0][2]);
// Emit one line of HTML per roster row.
echo "{$position} - {$name} - Number {$number} <br>\n";
}
}
// Posts a result-lookup form with cURL and tries to read one table cell from the
// returned HTML via XPath. The trailing comments record that the query matched nothing.
$url="http://www.rtu.ac.in/results/reformat.php";
// URL-encoded form fields sent as the POST body.
$post="rollnumber=08epccs060&filename=fetchmodulesem_4_btech410m.php&button=Submit";
$ch=curl_init();
curl_setopt($ch,CURLOPT_URL,$url);
// Send the request as a POST carrying the fields above.
curl_setopt($ch,CURLOPT_POST,1);
curl_setopt($ch,CURLOPT_POSTFIELDS,$post);
// Follow redirects, and have curl_exec() return the response instead of printing it.
curl_setopt($ch,CURLOPT_FOLLOWLOCATION,1);
curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
$content=curl_exec($ch);
curl_close($ch);
// XPath to the target cell: 4th table in body, 3rd row, 4th cell.
// NOTE(review): the tbody step presumably only matches if the served markup really
// contains <tbody> elements — verify against the raw response.
$totalPath="html/body/table[4]/tbody/tr[3]/td[4]";
$page=new DOMDocument();
// NOTE(review): the DOMXPath is constructed before loadHTML() fills the document;
// this ordering may be why the query below finds no nodes — confirm.
$xpath=new DOMXPath($page);
$page->loadHTML($content);
// NOTE(review): the value returned by saveHTML() is discarded on this line.
$page->saveHTML(); // this shows the page contents
$total=$xpath->query($totalPath);
echo $total->length; //shows 0
// NOTE(review): item(0) is dereferenced without checking that a node was matched.
echo $total->item(0)->nodeValue; //shows nothing
source: http://stackoverflow.com/questions/6283361/unable-to-get-table-data-from-a-html-page
PHP (June 2011)
// Submits a form via cURL POST, parses the HTML response with DOMDocument and queries
// a single cell with DOMXPath; per the trailing comments the query returns no nodes.
$url="http://www.rtu.ac.in/results/reformat.php";
// Request body: three URL-encoded form fields.
$post="rollnumber=08epccs060&filename=fetchmodulesem_4_btech410m.php&button=Submit";
$ch=curl_init();
curl_setopt($ch,CURLOPT_URL,$url);
// POST request with $post as its payload.
curl_setopt($ch,CURLOPT_POST,1);
curl_setopt($ch,CURLOPT_POSTFIELDS,$post);
// Follow redirects; return the response body from curl_exec() rather than echoing it.
curl_setopt($ch,CURLOPT_FOLLOWLOCATION,1);
curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
$content=curl_exec($ch);
curl_close($ch);
// Positional XPath: body's 4th table, its tbody, 3rd row, 4th cell.
// NOTE(review): presumably fails if the response has no literal <tbody> — verify.
$totalPath="html/body/table[4]/tbody/tr[3]/td[4]";
$page=new DOMDocument();
// NOTE(review): DOMXPath is created while the document is still empty, i.e. before
// loadHTML(); check whether it must be created after loading.
$xpath=new DOMXPath($page);
$page->loadHTML($content);
// NOTE(review): saveHTML()'s return value is not used here.
$page->saveHTML(); // this shows the page contents
$total=$xpath->query($totalPath);
echo $total->length; //shows 0
// NOTE(review): no guard for an empty result before reading nodeValue.
echo $total->item(0)->nodeValue; //shows nothing
!
!
XQuery
Real World Example
awesome site
awesome data
no API
Deal with sessions
Need to emulate setting options
Different Notions: Publisher <=> Consumer
Website App
CSV! HTML! XLS! Zip!
JSON? XML?
Website App
CSV! HTML! XLS! Zip!
JSON? XML?
Session!
Stateless
REST
API ?
Website App
CSV! HTML! XLS! Zip!
JSON? XML?
Session!
Stateless
REST
API ?
Customize with URL Params
HTML Forms
Website App
CSV! HTML! XLS! Zip!
JSON? XML?
Session!
Stateless
REST
API ?
Customize with URL Params
HTML Forms
Website App
CSV! HTML! XLS! Zip! Session!
HTML Forms
HTML !
Session!
HTML Forms
XQuery !
Summary
XQuery Web Data Processing
A browser can do it?
XQuery can do it!
Session handling
Forms
!
!
Result: http://www.unemployment.by/country
Recommended