diff --git a/.gitignore b/.gitignore index 485dee6..1f746d4 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ .idea +nextcheck.dat +config.php diff --git a/config.php b/config.php new file mode 100644 index 0000000..16ce7b1 --- /dev/null +++ b/config.php @@ -0,0 +1,8 @@ + 'localhost', + 'user' => '', + 'pass' => '', + 'db' => '', + 'api_key' => '' +); diff --git a/install.php b/install.php index c7156c2..7060872 100644 --- a/install.php +++ b/install.php @@ -1,69 +1,25 @@ -usinglists = array('googpub-phish-shavar','goog-malware-shavar'); -//Install MySQL tables -foreach($phpgsb->usinglists as $value) - { - //Create ADD tables - mysql_query("CREATE TABLE IF NOT EXISTS `$value-a-hosts` ( - `ID` int(255) NOT NULL auto_increment, - `Hostkey` varchar(8) NOT NULL, - `Chunknum` int(255) NOT NULL, - `Count` varchar(2) NOT NULL default '0', - `FullHash` varchar(70) NOT NULL, - PRIMARY KEY (`ID`), - KEY `Hostkey` (`Hostkey`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 ;"); - mysql_query("CREATE TABLE IF NOT EXISTS `$value-a-index` ( - `ChunkNum` int(255) NOT NULL auto_increment, - `Chunklen` int(255) NOT NULL default '0', - PRIMARY KEY (`ChunkNum`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 ;"); - mysql_query("CREATE TABLE IF NOT EXISTS `$value-a-prefixes` ( - `ID` int(255) NOT NULL auto_increment, - `Hostkey` varchar(8) NOT NULL, - `Prefix` varchar(255) NOT NULL, - `FullHash` varchar(70) NOT NULL, - PRIMARY KEY (`ID`), - KEY `Hostkey` (`Hostkey`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 ;"); - //Create SUB tables - mysql_query("CREATE TABLE IF NOT EXISTS `$value-s-hosts` ( - `ID` int(255) NOT NULL auto_increment, - `Hostkey` varchar(8) NOT NULL, - `Chunknum` int(255) NOT NULL, - `Count` varchar(2) NOT NULL default '0', - `FullHash` varchar(70) NOT NULL, - PRIMARY KEY (`ID`), - KEY `Hostkey` (`Hostkey`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 ;"); - mysql_query("CREATE TABLE IF NOT EXISTS `$value-s-index` ( - `ChunkNum` int(255) NOT NULL auto_increment, - `Chunklen` int(255) NOT NULL default '0', - PRIMARY KEY (`ChunkNum`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 ;"); - mysql_query("CREATE TABLE IF NOT EXISTS `$value-s-prefixes` ( - `ID` int(255) NOT NULL auto_increment, - `Hostkey` varchar(8) NOT NULL, - `AddChunkNum` varchar(8) NOT NULL, - `Prefix` varchar(255) NOT NULL, - `FullHash` varchar(70) NOT NULL, - PRIMARY KEY (`ID`), - KEY `Hostkey` (`Hostkey`) -) ENGINE=InnoDB DEFAULT CHARSET=latin1 ;"); - } -//Check timeout files writable -if(file_put_contents("testfile.dat","TEST PRE-USE PHPGSB ".time())) - unlink("testfile.dat"); -else - echo "ERROR: THIS DIRECTORY IS NOT WRITABLE, CHMOD to 775 or 777"; -?> \ No newline at end of file +usinglists = array( + 'googpub-phish-shavar', + 'goog-malware-shavar' +); + +$phpgsb->install(); + +//Check timeout files writable +if (file_put_contents("testfile.dat", "TEST PRE-USE PHPGSB " . time())) { + unlink("testfile.dat"); +} else { + echo "DIRECTORY IS NOT WRITABLE, CHMOD to 775 or 777"; +} diff --git a/listupdater.php b/listupdater.php index a6d9425..41c216c 100644 --- a/listupdater.php +++ b/listupdater.php @@ -1,17 +1,17 @@ -apikey = "API_KEY_HERE"; -$phpgsb->usinglists = array('googpub-phish-shavar','goog-malware-shavar'); -$phpgsb->runUpdate(); -$phpgsb->close(); -?> \ No newline at end of file +apikey = $config['api_key']; +$phpgsb->usinglists = array('googpub-phish-shavar','goog-malware-shavar'); +$phpgsb->runUpdate(); diff --git a/lookup.php b/lookup.php index 8313127..805d815 100644 --- a/lookup.php +++ b/lookup.php @@ -1,20 +1,21 @@ -apikey = "API_KEY_HERE"; -$phpgsb->usinglists = array('googpub-phish-shavar','goog-malware-shavar'); -//Should return false (not phishing or malware) -var_dump($phpgsb->doLookup('http://www.google.com')); -//Should return true, malicious URL -var_dump($phpgsb->doLookup('http://www.gumblar.cn')); -$phpgsb->close(); -?> \ No newline at end of file +apikey = $config['api_key']; + +$phpgsb->usinglists = array('googpub-phish-shavar','goog-malware-shavar'); +// Should return false (not phishing or malware) +var_dump($phpgsb->doLookup('http://www.google.com')); +// Should return true, malicious URL +var_dump($phpgsb->doLookup('http://www.gumblar.cn')); diff --git a/phpgsb.class.php b/phpgsb.class.php index 347026c..2f121a7 100644 --- a/phpgsb.class.php +++ b/phpgsb.class.php @@ -1,1404 +1,1551 @@ -silent(); - $this->outputmsg("phpGSB Loaded"); - if($database&&$username) - $this->dbConnect($database,$username,$password,$host); - } - function close() - { - mysql_close(); - $this->outputmsg("Closing phpGSB. (Peak Memory: ".(round(memory_get_peak_usage()/1048576,3))."MB)"); - } - function silent() - { - $this->verbose = false; - } - function trans_disable() - { - $this->transenabled = false; - } - function trans_enable() - { - $this->transenabled = true; - } - function trans_begin() - { - if($this->transenabled) - { - $this->transtarted = true; - $this->outputmsg("Begin MySQL Transaction"); - mysql_query("BEGIN"); - } - } - function trans_commit() - { - if($this->transtarted&&mysql_ping()&&$this->transenabled) - { - $this->transtarted = false; - $this->outputmsg("Comitting Transaction"); - mysql_query("COMMIT"); - } - } - function trans_rollback() - { - if($this->transtarted&&mysql_ping()&&$this->transenabled) - { - $this->transtarted = false; - $this->outputmsg("Rolling Back Transaction"); - mysql_query("ROLLBACK"); - } - } - /*Function to output messages, used instead of echo, - will make it easier to have a verbose switch in later - releases*/ - function outputmsg($msg) - { - if($this->verbose) - { - echo $msg.'...
'; - $this->ob .= ob_get_contents(); - ob_flush(); - } - } - /*Function to output errors, used instead of echo, - will make it easier to have a verbose switch in later - releases*/ - function fatalerror($msg) - { - if($this->verbose) - { - print_r($msg); - echo '...
'; - $this->ob .= ob_get_contents(); - ob_end_flush(); - } - $this->trans_rollback(); - die(); - } - /*Wrapper to connect to database. Simples.*/ - function dbConnect($database,$username,$password,$host="localhost") - { - $link = mysql_connect($host, $username, $password); - if (!$link) { - $this->fatalerror('Could not connect: ' . mysql_error()); - } - $this->outputmsg('Connected successfully to database server'); - $db_selected = mysql_select_db($database, $link); - if (!$db_selected) { - $this->fatalerror('Can\'t use $database : ' . mysql_error()); - } - $this->outputmsg('Connected to database successfully'); - } - /*Simple logic function to calculate timeout - based on the number of previous errors*/ - function calc($errors) - { - //According to Developer Guide Formula - if($errors==1) - { - //According to Developer Guide (1st error, wait a minute) - return 60; - } - elseif($errors>5) - { - //According to Developer Guide (Above 5 errors check every 4 hours) - return 28800; - } - else - { - //According to Developer Guide we simply double up our timeout each time and use formula: - //(Adapted to be relative to errors) ( ((2^$errors) * 7.5) * (decimalrand(0,1) + 1)) to produce - // a result between: 120min-240min for example - return floor((pow(2,$errors) * 7.5) * ((rand(0,1000)/1000) + 1)); - } - } - /*Writes backoff timeouts, uses calc() to - calculate timeouts and then writes to file - for next check*/ - function Backoff($errdata=false,$type) - { - if($type=="data") - $file = 'nextcheck.dat'; - else - $file = 'nextcheckl.dat'; - $curstatus = explode('||',file_get_contents($this->pingfilepath.$file)); - $curstatus[1] = $curstatus[1] + 1; - $seconds = $this->calc($curstatus[1]); - $until = time()+$seconds.'||'.$curstatus[1]; - file_put_contents($this->pingfilepath.$file,$until); - $this->fatalerror(array("Invalid Response... Backing Off",$errdata)); - } - /*Writes timeout from valid requests to nextcheck file*/ - function setTimeout($seconds) - { - if (file_exists($this->pingfilepath.'nextcheck.dat')) { - $curstatus = explode('||',@file_get_contents($this->pingfilepath.'nextcheck.dat')); - $until = time()+$seconds.'||'.$curstatus[1]; - } else { - $until = time()+$seconds.'||'; - } - file_put_contents($this->pingfilepath.'nextcheck.dat',$until); - } - /*Checks timeout in timeout files (usually performed at the - start of script)*/ - function checkTimeout($type) - { - if($type=="data") - $file = 'nextcheck.dat'; - else - $file = 'nextcheckl.dat'; - $curstatus = explode('||',file_get_contents($this->pingfilepath.$file)); - if(time()<$curstatus[0]) - { - $this->fatalerror("Must wait another ".($curstatus[0]-time()). " seconds before another request"); - } - else - $this->outputmsg("Allowed to request"); - } - /*Function downloads from URL's, POST data can be - passed via $options. $followbackoff indicates - whether to follow backoff procedures or not*/ - function googleDownloader($url,$options,$followbackoff=false) - { - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_HEADER, 0); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - - if(is_array($options)) - curl_setopt_array($ch, $options); - - $data = curl_exec($ch); - $info = curl_getinfo($ch); - curl_close($ch); - if($followbackoff&&$info['http_code']>299) - { - $this->Backoff($info,$followbackoff); - } - return array($info,$data); - } - //UPDATER FUNCTIONS - /*Resets lists database, only called if GSB issues r:resetdatabase*/ - function resetDatabase() - { - //Lord knows why they would EVER issue this request! - if(!empty($this->adminemail)) - mail($this->adminemail,'Reset Database Request Issued','For some crazy unknown reason GSB requested a database reset at '.time()); - foreach($this->usinglists as $value) - { - mysql_query("TRUNCATE TABLE `$value-s-index`"); - mysql_query("TRUNCATE TABLE `$value-s-hosts`"); - mysql_query("TRUNCATE TABLE `$value-s-prefixes`"); - mysql_query("TRUNCATE TABLE `$value-a-index`"); - mysql_query("TRUNCATE TABLE `$value-a-hosts`"); - mysql_query("TRUNCATE TABLE `$value-a-prefixes`"); - } - } - /*Processes data recieved from a GSB data request into a managable array*/ - function processChunks($fulldata,$listname) - { - $subarray = array(); - $addarray = array(); - $loaddata = trim($fulldata); - $clonedata = $loaddata; - while(strlen($clonedata)>0) - { - $splithead = explode("\n",$clonedata,2); - $chunkinfo = explode(':',$splithead[0]); - $type = $chunkinfo[0]; - $chunknum = $chunkinfo[1]; - $hashlen = $chunkinfo[2]; - $chunklen = $chunkinfo[3]; - if($chunklen>0) - { - $tmparray = array(); - //Convert to hex for easy processing - //First get chunkdata according to length - $chunkdata = bin2hex(substr($splithead[1],0,$chunklen)); - if($type=='a') - { - $maini = 0; - while(strlen($chunkdata)>0) - { - $tmparray[$maini]['HOSTKEY'] = substr($chunkdata, 0, 8); - $tmparray[$maini]['COUNT'] = substr($chunkdata, 8, 2); - $chunkdata = substr($chunkdata,10); - $realcount = hexdec($tmparray[$maini]['COUNT']); - if($realcount>0) - { - for ($i = 0; $i < $realcount; $i++) { - $tmparray[$maini]['PAIRS'][$i]['PREFIX'] = substr($chunkdata, 0, ($hashlen*2)); - $chunkdata = substr($chunkdata,(($hashlen*2))); - } - } - elseif($realcount<0) - { - $this->fatalerror(array("Decoding Error, Somethings gone wrong!",$tmparray[$maini])); - } - $maini++; - } - $addarray['CHUNKNUM'] = $chunknum; - $addarray['HASHLEN'] = $hashlen; - $addarray['CHUNKLEN'] = $chunklen; - $addarray['REAL'] = $tmparray; - $this->saveChunkPart($addarray,"ADD",$listname); - unset($addarray); - } - elseif($type=='s') - { - $maini = 0; - while(strlen($chunkdata)>0) - { - $tmparray[$maini]['HOSTKEY'] = substr($chunkdata, 0, 8); - $tmparray[$maini]['COUNT'] = substr($chunkdata, 8, 2); - $chunkdata = substr($chunkdata,10); - $realcount = hexdec($tmparray[$maini]['COUNT']); - if($realcount>0) - { - for ($i = 0; $i < $realcount; $i++) { - $tmparray[$maini]['PAIRS'][$i]['ADDCHUNKNUM'] = substr($chunkdata, 0, 8); - $tmparray[$maini]['PAIRS'][$i]['PREFIX'] = substr($chunkdata, 8, ($hashlen*2)); - $chunkdata = substr($chunkdata,(($hashlen*2)+8)); - } - } - elseif($realcount==0) - { - $tmparray[$maini]['PAIRS'][0]['ADDCHUNKNUM'] = substr($chunkdata, 0, 8); - $chunkdata = substr($chunkdata, 8); - } - else - { - $this->fatalerror(array("Decoding Error, Somethings gone wrong!",$tmparray[$maini])); - } - $maini++; - } - $subarray['CHUNKNUM'] = $chunknum; - $subarray['HASHLEN'] = $hashlen; - $subarray['CHUNKLEN'] = $chunklen; - $subarray['REAL'] = $tmparray; - $this->saveChunkPart($subarray,"SUB",$listname); - unset($subarray); - } - else - { - $this->outputmsg("DISCARDED CHUNKNUM: $chunknum (Had no valid label)"); - } - } - else - { - //No ChunkData, Still Insert - if($type=='a') - { - $addarray['CHUNKNUM'] = $chunknum; - $addarray['HASHLEN'] = $hashlen; - $addarray['CHUNKLEN'] = $chunklen; - $this->saveChunkPart($addarray,"ADD",$listname); - unset($addarray); - } - elseif($type=='s') - { - $subarray['CHUNKNUM'] = $chunknum; - $subarray['HASHLEN'] = $hashlen; - $subarray['CHUNKLEN'] = $chunklen; - $this->saveChunkPart($subarray,"SUB",$listname); - unset($subarray); - } - else - { - $this->outputmsg("DISCARDED CHUNKNUM: $chunknum (Empty)"); - } - } - $clonedata = substr($splithead[1],$chunklen); - } - return true; - } - /*Saves processed data to the MySQL database*/ - function saveChunkPart($data,$type,$listname) - { - $listname = trim($listname); - //Check what type of data it is... - $buildindex = array(); - $buildhost = array(); - $buildpairs = array(); - if($type=="SUB") - { - $value = $data; - if(!isset($this->mainlist['s'][$listname][$value['CHUNKNUM']])) - { - $this->mainlist['s'][$listname][$value['CHUNKNUM']] = true; - $buildindex[] = "('{$value['CHUNKNUM']}','{$value['CHUNKLEN']}')"; - if($value['CHUNKLEN']>0) - { - foreach($value['REAL'] as $newkey=>$newvalue) - { - $buildhost[] = "('{$newvalue['HOSTKEY']}','{$value['CHUNKNUM']}','{$newvalue['COUNT']}','')"; - if(isset($newvalue['PAIRS'])&&count($newvalue['PAIRS'])>0) - { - foreach($newvalue['PAIRS'] as $innerkey=>$innervalue) - { - if( isset($innervalue['PREFIX']) ) { - $buildpairs[] = "('{$newvalue['HOSTKEY']}','{$innervalue['ADDCHUNKNUM']}','{$innervalue['PREFIX']}','')"; - } else { - $buildpairs[] = "('{$newvalue['HOSTKEY']}','{$innervalue['ADDCHUNKNUM']}','','')"; - } - } - } - } - } - } - } - else if($type=="ADD") - { - //Then lets insert add data - $value = $data; - if(!isset($this->mainlist['a'][$listname][$value['CHUNKNUM']])) - { - $this->mainlist['a'][$listname][$value['CHUNKNUM']] = true; - $buildindex[] = "('{$value['CHUNKNUM']}','{$value['CHUNKLEN']}')"; - if($value['CHUNKLEN']>0) - { - foreach($value['REAL'] as $newkey=>$newvalue) - { - $buildhost[] = "('{$newvalue['HOSTKEY']}','{$value['CHUNKNUM']}','{$newvalue['COUNT']}','')"; - if(isset($newvalue['PAIRS'])&&count($newvalue['PAIRS'])>0) - { - foreach($newvalue['PAIRS'] as $innerkey=>$innervalue) - { - if( isset($innervalue['PREFIX']) ) { - $buildpairs[] = "('{$newvalue['HOSTKEY']}','{$innervalue['PREFIX']}','')"; - } else { - $buildpairs[] = "('{$newvalue['HOSTKEY']}','','')"; - } - } - } - } - } - } - } - if(count($buildindex)>0) - { - if($type=="ADD") - $listtype = 'a'; - elseif($type=="SUB") - $listtype = 's'; - //Insert index value - $indexinsert = implode(', ',$buildindex); - $indexins = mysql_query("INSERT INTO `$listname-$listtype-index` (`ChunkNum`,`Chunklen`) VALUES $indexinsert;"); - $error = mysql_error(); - if($indexins) - { - if(count($buildhost)>0) - { - //Insert hostkeys index - $hostinsert = implode(', ',$buildhost); - mysql_query("INSERT INTO `$listname-$listtype-hosts` (`Hostkey`,`Chunknum`,`Count`,`FullHash`) VALUES $hostinsert;"); - $error = mysql_error(); - if(!empty($error)) - $this->outputmsg("INSERTED $listname $type HOST KEYS ".mysql_error()); - } - if(count($buildpairs)>0) - { - //Insert prefixes - $pairinsert = implode(', ',$buildpairs); - if($type=="ADD") - mysql_query("INSERT INTO `$listname-$listtype-prefixes` (`Hostkey`,`Prefix`,`FullHash`) VALUES $pairinsert;"); - elseif($type=="SUB") - mysql_query("INSERT INTO `$listname-$listtype-prefixes` (`Hostkey`,`AddChunkNum`,`Prefix`,`FullHash`) VALUES $pairinsert;"); - $error = mysql_error(); - if(!empty($error)) - $this->outputmsg("INSERTED $listname $type PREFIX HOST KEYS ".mysql_error()); - } - } - elseif(!empty($error)) - $this->outputmsg("COULD NOT SAVE $listname $type INDEXS ".mysql_error()); - } - } - /*Get ranges of existing chunks from a requested list - and type (add [a] or sub [s] return them and set - mainlist to recieved for that chunk (prevent dupes)*/ - function getRanges($listname,$mode) - { - $checktable = $listname.'-'.$mode.'-index'; - $results = mysql_query("SELECT ChunkNum FROM `$checktable` ORDER BY `ChunkNum` ASC"); - $ranges = array(); - $i = 0; - $start = 0; - while ($row = mysql_fetch_array($results, MYSQL_BOTH)) - { - $this->mainlist[$mode][$listname][$row['ChunkNum']] = true; - if($i==0) - { - $start = $row['ChunkNum']; - $previous = $row['ChunkNum']; - } - else - { - $expected = $previous + 1; - if($row['ChunkNum']!=$expected) - { - if($start==$previous) - $ranges[] = $start; - else - $ranges[] = $start.'-'.$previous; - $start = $row['ChunkNum']; - } - $previous = $row['ChunkNum']; - } - $i++; - } - if($start>0&&$previous>0) - { - if($start==$previous) - $ranges[] = $start; - else - $ranges[] = $start.'-'.$previous; - } - return $ranges; - } - /*Get both add and sub ranges for a requested list*/ - function getFullRanges($listname) - { - $subranges = $this->getRanges($listname,'s'); - $addranges = $this->getRanges($listname,'a'); - return array("Subranges"=>$subranges,"Addranges"=>$addranges); - } - /*Format a full request body for a desired list including - name and full ranges for add and sub*/ - function formattedRequest($listname) - { - $fullranges = $this->getFullRanges($listname); - $buildpart = ''; - if(count($fullranges['Subranges'])>0) - $buildpart .= 's:'.implode(',',$fullranges['Subranges']); - if(count($fullranges['Subranges'])>0&&count($fullranges['Addranges'])>0) - $buildpart .= ':'; - if(count($fullranges['Addranges'])>0) - $buildpart .= 'a:'.implode(',',$fullranges['Addranges']); - return $listname.';'.$buildpart."\n"; - } - /*Called when GSB returns a SUB-DEL or ADD-DEL response*/ - function deleteRange($range,$mode,$listname) - { - $buildtrunk = $listname.'-'.$mode; - if(substr_count($range,'-')>0) - { - $deleterange = explode('-',trim($range)); - $clause = "`ChunkNum` >= '{$deleterange[0]}' AND `ChunkNum` <= '{$deleterange[1]}'"; - } - else - $clause = "`ChunkNum` = '$range'"; - //Delete from index - mysql_query("DELETE FROM `$buildtrunk-index` WHERE $clause"); - - //Select all host keys that match chunks (we'll delete them after but we need the hostkeys list!) - $result = mysql_query("SELECT `Hostkey` FROM `$buildtrunk-hosts` WHERE $clause"); - $buildprefixdel = array(); - if($result&&mysql_num_rows($result)>0) - { - while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) - { - if(!empty($row['Hostkey'])) - $buildprefixdel[] = $row['Hostkey']; - } - if (count($buildprefixdel)) { - //Delete all matching hostkey prefixes - mysql_query( - "DELETE FROM `$buildtrunk-prefixes` WHERE `Hostkey` in ('" . implode('\',\'', $buildprefixdel) . "')" - ); - } - - //Delete all matching hostkeys - mysql_query("DELETE FROM `$buildtrunk-hosts` WHERE $clause"); - } - } - /*Main part of updater function, will call all other functions, merely requires - the request body, it will then process and save all data as well as checking - for ADD-DEL and SUB-DEL, runs silently so won't return anything on success*/ - function getData($body) - { - if(empty($body)) - $this->fatalerror("Missing a body for data request"); - $this->trans_begin(); - $buildopts = array(CURLOPT_POST=>true,CURLOPT_POSTFIELDS=>$body."\n"); - $result = $this->googleDownloader("http://safebrowsing.clients.google.com/safebrowsing/downloads?client=api&apikey=".$this->apikey."&appver=".$this->version."&pver=".$this->apiversion,$buildopts,"data"); - preg_match('/^n:(.*)$/m', $result[1], $match); - $timeout = $match[1]; - $this->setTimeout($timeout); - if(substr_count($result[1],'r:pleasereset')>0) - $this->resetDatabase(); - else - { - $formattedlist = array(); - if(substr_count($result[1],'i:')>0) - { - $splitlists = explode('i:',$result[1]); - unset($splitlists[0]); - foreach($splitlists as $key=>$value) - { - $listdata = explode("\n",trim($value)); - $listname = $listdata[0]; - unset($listdata[0]); - $formattedlist[$listname] = $listdata; - } - foreach($formattedlist as $key=>$value) - { - $listname = $key; - foreach($value as $keyinner=>$valueinner) - { - if(substr_count($valueinner,"u:")>0) - { - $chunkdata = $this->googleDownloader('http://'.trim(str_replace('u:','',$valueinner)),false,"data"); - $processed = $this->processChunks($chunkdata[1],$listname); - $this->outputmsg("Saved a chunk file"); - } - elseif(substr_count($valueinner,"ad:")>0) - { - if(substr_count($valueinner,',')>0) - { - $valueinner = explode(',',trim(str_replace("ad:","",$valueinner))); - foreach($valueinner as $keyadd=>$valueadd) - { - $this->deleteRange($valueadd,'a',$listname); - } - } - else - $this->deleteRange(trim(str_replace("ad:","",$valueinner)),'a',$listname); - } - elseif(substr_count($valueinner,"sd:")>0) - { - if(substr_count($valueinner,',')>0) - { - $valueinner = explode(',',trim(str_replace("sd:","",$valueinner))); - foreach($valueinner as $keyadd=>$valueadd) - { - $this->deleteRange($valueadd,'s',$listname); - } - } - else - $this->deleteRange(trim(str_replace("sd:","",$valueinner)),'s',$listname); - } - } - - } - } - else - { - $this->outputmsg('No data available in list'); - } - } - $this->trans_commit(); - return true; - } - /*Shortcut to run updater*/ - function runUpdate() - { - $this->checkTimeout('data'); - $require = ""; - foreach($this->usinglists as $value) - $require .= $this->formattedRequest($value); - $this->outputmsg("Using $require"); - $this->getData($require); - } - //LOOKUP FUNCTIONS - /*Used to check the canonicalize function*/ - function validateMethod() - { - //Input => Expected - $cases = array( - "http://host/%25%32%35" => "http://host/%25", - "http://host/%25%32%35%25%32%35" => "http://host/%25%25", - "http://host/%2525252525252525" => "http://host/%25", - "http://host/asdf%25%32%35asd" => "http://host/asdf%25asd", - "http://host/%%%25%32%35asd%%" => "http://host/%25%25%25asd%25%25", - "http://www.google.com/" => "http://www.google.com/", - "http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/" => "http://168.188.99.26/.secure/www.ebay.com/", - "http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/" => "http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/", - "http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B" => 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+', - "http://3279880203/blah" => "http://195.127.0.11/blah", - "http://www.google.com/blah/.." => "http://www.google.com/", - "www.google.com/" => "http://www.google.com/", - "www.google.com" => "http://www.google.com/", - "http://www.evil.com/blah#frag" => "http://www.evil.com/blah", - "http://www.GOOgle.com/" => "http://www.google.com/", - "http://www.google.com.../" => "http://www.google.com/", - "http://www.google.com/foo\tbar\rbaz\n2" => "http://www.google.com/foobarbaz2", - "http://www.google.com/q?" => "http://www.google.com/q?", - "http://www.google.com/q?r?" => "http://www.google.com/q?r?", - "http://www.google.com/q?r?s" => "http://www.google.com/q?r?s", - "http://evil.com/foo#bar#baz" => "http://evil.com/foo", - "http://evil.com/foo;" => "http://evil.com/foo;", - "http://evil.com/foo?bar;" => "http://evil.com/foo?bar;", - "http://\x01\x80.com/" => "http://%01%80.com/", - "http://notrailingslash.com" => "http://notrailingslash.com/", - "http://www.gotaport.com:1234/" => "http://www.gotaport.com:1234/", - " http://www.google.com/ " => "http://www.google.com/", - "http:// leadingspace.com/" => "http://%20leadingspace.com/", - "http://%20leadingspace.com/" => "http://%20leadingspace.com/", - "%20leadingspace.com/" => "http://%20leadingspace.com/", - "https://www.securesite.com/" => "https://www.securesite.com/", - "http://host.com/ab%23cd" => "http://host.com/ab%23cd", - "http://host.com//twoslashes?more//slashes" => "http://host.com/twoslashes?more//slashes" - ); - foreach($cases as $key=>$value) - { - $canit = $this->Canonicalize($key); - $canit = $canit['GSBURL']; - if($canit==$value) - outputmsg("PASSED: $key"); - else - outputmsg("INVALID:
ORIGINAL: $key
EXPECTED: $value
RECIEVED: $canit
"); - - } - } - /*Special thanks Steven Levithan (stevenlevithan.com) for the ridiculously complicated regex - required to parse urls. This is used over parse_url as it robustly provides access to - port, userinfo etc and handles mangled urls very well. - Expertly integrated into phpGSB by Sam Cleaver ;) - Thanks to mikegillis677 for finding the seg. fault issue in the old function. - Passed validateMethod() check on 17/01/12*/ - function j_parseUrl($url) - { - $strict = '/^(?:([^:\/?#]+):)?(?:\/\/\/?((?:(([^:@]*):?([^:@]*))?@)?([^:\/?#]*)(?::(\d*))?))?(((?:\/(\w:))?((?:[^?#\/]*\/)*)([^?#]*))(?:\?([^#]*))?(?:#(.*))?)/'; - $loose = '/^(?:(?![^:@]+:[^:@\/]*@)([^:\/?#.]+):)?(?:\/\/\/?)?((?:(([^:@]*):?([^:@]*))?@)?([^:\/?#]*)(?::(\d*))?)(((?:\/(\w:))?(\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/'; - preg_match($loose, $url, $match); - if(empty($match)) - { - //As odd as its sounds, we'll fall back to strict (as technically its more correct and so may salvage completely mangled urls) - unset($match); - preg_match($strict, $url, $match); - } - $parts = array("source"=>'',"scheme"=>'',"authority"=>'',"userinfo"=>'',"user"=>'',"password"=>'',"host"=>'',"port"=>'',"relative"=>'',"path"=>'',"drive"=>'',"directory"=>'',"file"=>'',"query"=>'',"fragment"=>''); - switch (count ($match)) { - case 15: $parts['fragment'] = $match[14]; - case 14: $parts['query'] = $match[13]; - case 13: $parts['file'] = $match[12]; - case 12: $parts['directory'] = $match[11]; - case 11: $parts['drive'] = $match[10]; - case 10: $parts['path'] = $match[9]; - case 9: $parts['relative'] = $match[8]; - case 8: $parts['port'] = $match[7]; - case 7: $parts['host'] = $match[6]; - case 6: $parts['password'] = $match[5]; - case 5: $parts['user'] = $match[4]; - case 4: $parts['userinfo'] = $match[3]; - case 3: $parts['authority'] = $match[2]; - case 2: $parts['scheme'] = $match[1]; - case 1: $parts['source'] = $match[0]; - } - return $parts; - } - /*Regex to check if its a numerical IP address*/ - function is_ip($ip) - { - return preg_match("/^([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])" . - "(\.([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}$/", $ip); - } - /*Checks if input is in hex format*/ - function is_hex($x) - { - //Relys on the fact that hex often includes letters meaning PHP will disregard the string - if(($x+3) == 3) - return dechex(hexdec($x)) == $x; - return false; - } - /*Checks if input is in octal format*/ - function is_octal($x) - { - //Relys on the fact that in IP addressing octals must begin with a 0 to denote octal - return substr($x,0,1) == 0; - } - /*Converts hex or octal input into decimal */ - function hexoct2dec($value) - { - //As this deals with parts in IP's we can be more exclusive - if(substr_count(substr($value,0,2),'0x')>0&&$this->is_hex($value)) - { - return hexdec($value); - } - elseif($this->is_octal($value)) - { - return octdec($value); - } - else - return false; - } - /*Converts IP address part in HEX to decimal*/ - function iphexdec($hex) - { - //Removes any leading 0x (used to denote hex) and then and leading 0's) - $temp = str_replace('0x','',$hex); - $temp = ltrim($temp,"0"); - return hexdec($temp); - } - /*Converts full IP address in HEX to decimal*/ - function hexIPtoIP($hex) - { - //Remove hex identifier and leading 0's (not significant) - $tempip = str_replace('0x','',$hex); - $tempip = ltrim($tempip,"0"); - //It might be hex - if($this->is_hex($tempip)) - { - //There may be a load of junk before the part we need - if(strlen($tempip)>8) - { - $tempip = substr($tempip,-8); - } - $hexplode = preg_split('//', $tempip, -1, PREG_SPLIT_NO_EMPTY); - while(count($hexplode)<8) - array_unshift($hexplode,0); - //Normalise - $newip = hexdec($hexplode[0].$hexplode[1]).'.'.hexdec($hexplode[2].$hexplode[3]).'.'.hexdec($hexplode[4].$hexplode[5]).'.'.hexdec($hexplode[6].$hexplode[7]); - //Now check if its an IP - if($this->is_ip($newip)) - return $newip; - else - return false; - } - else - return false; - } - /*Checks if an IP provided in either hex, octal or decimal is in fact - an IP address. Normalises to a four part IP address.*/ - function isValid_IP($ip) - { - //First do a simple check, if it passes this no more needs to be done - if($this->is_ip($ip)) - return $ip; - - //Its a toughy... eerm perhaps its all in hex? - $checkhex = $this->hexIPtoIP($ip); - if($checkhex) - return $checkhex; - - //If we're still here it wasn't hex... maybe a DWORD format? - $checkdword = $this->hexIPtoIP(dechex($ip)); - if($checkdword) - return $checkdword; - - //Nope... maybe in octal or a combination of standard, octal and hex?! - $ipcomponents = explode('.',$ip); - $ipcomponents[0] = $this->hexoct2dec($ipcomponents[0]); - if(count($ipcomponents)==2) - { - //The writers of the RFC docs certainly didn't think about the clients! This could be a DWORD mixed with an IP part - if($ipcomponents[0]<=255&&is_int($ipcomponents[0])&&is_int($ipcomponents[1])) - { - $threeparts = dechex($ipcomponents[1]); - $hexplode = preg_split('//', $threeparts, -1, PREG_SPLIT_NO_EMPTY); - if(count($hexplode)>4) - { - $newip = $ipcomponents[0].'.'.$this->iphexdec($hexplode[0].$hexplode[1]).'.'.$this->iphexdec($hexplode[2].$hexplode[3]).'.'.$this->iphexdec($hexplode[4].$hexplode[5]); - //Now check if its valid - if($this->is_ip($newip)) - return $newip; - } - } - } - $ipcomponents[1] = $this->hexoct2dec($ipcomponents[1]); - if(count($ipcomponents)==3) - { - //Guess what... it could also be a DWORD mixed with two IP parts! - if(($ipcomponents[0]<=255&&is_int($ipcomponents[0]))&&($ipcomponents[1]<=255&&is_int($ipcomponents[1]))&&is_int($ipcomponents[2])) - { - $twoparts = dechex($ipcomponents[2]); - $hexplode = preg_split('//', $twoparts, -1, PREG_SPLIT_NO_EMPTY); - if(count($hexplode)>3) - { - $newip = $ipcomponents[0].'.'.$ipcomponents[1].'.'.$this->iphexdec($hexplode[0].$hexplode[1]).'.'.$this->iphexdec($hexplode[2].$hexplode[3]); - //Now check if its valid - if($this->is_ip($newip)) - return $newip; - } - } - } - //If not it may be a combination of hex and octal - if(count($ipcomponents)>=4) - { - $tmpcomponents = array($ipcomponents[2],$ipcomponents[3]); - foreach($tmpcomponents as $key=>$value) - { - if(!$tmpcomponents[$key] = $this->hexoct2dec($value)) - return false; - } - - array_unshift($tmpcomponents,$ipcomponents[0],$ipcomponents[1]); - //Convert back to IP form - $newip = implode('.',$tmpcomponents); - - //Now check if its valid - if($this->is_ip($newip)) - return $newip; - } - - //Well its not an IP that we can recognise... theres only so much we can do! - return false; - } - /*Had to write another layer as built in PHP urlencode() escapes all non - alpha-numeric Google states to only urlencode if its below 32 or above - or equal to 127 (some of those are non alpha-numeric and so urlencode - on its own won't work).*/ - function flexURLEncode($url,$ignorehash=false) - { - //Had to write another layer as built in PHP urlencode() escapes all non alpha-numeric - //google states to only urlencode if its below 32 or above or equal to 127 (some of those - //are non alpha-numeric and so urlencode on its own won't work). - $urlchars = preg_split('//', $url, -1, PREG_SPLIT_NO_EMPTY); - if(count($urlchars)>0) - { - foreach($urlchars as $key=>$value) - { - - $ascii = ord($value); - if($ascii<=32||$ascii>=127||($value=='#'&&!$ignorehash)||$value=='%') - $urlchars[$key] = rawurlencode($value); - } - return implode('',$urlchars); - } - else - return $url; - } - /*Canonicalize a full URL according to Google's definition.*/ - function Canonicalize($url) - { - //Remove line feeds, return carriages, tabs, vertical tabs - $finalurl = trim(str_replace(array("\x09","\x0A","\x0D","\x0B"),'',$url)); - //URL Encode for easy extraction - $finalurl = $this->flexURLEncode($finalurl,true); - //Now extract hostname & path - $parts = $this->j_parseUrl($finalurl); - $hostname = $parts['host']; - $path = $parts['path']; - $query = $parts['query']; - $lasthost = ""; - $lastpath = ""; - $lastquery = ""; - //Remove all hex coding (loops max of 50 times to stop craziness but should never - //reach that) - for ($i = 0; $i < 50; $i++) { - $hostname = rawurldecode($hostname); - $path = rawurldecode($path); - $query = rawurldecode($query); - if($hostname==$lasthost&&$path==$lastpath&&$query==$lastquery) - break; - $lasthost = $hostname; - $lastpath = $path; - $lastquery = $query; - } - //Deal with hostname first - //Replace all leading and trailing dots - $hostname = trim($hostname,'.'); - //Replace all consecutive dots with one dot - $hostname = preg_replace("/\.{2,}/",".",$hostname); - //Make it lowercase - $hostname = strtolower($hostname); - //See if its a valid IP - $hostnameip = $this->isValid_IP($hostname); - if($hostnameip) - { - $usingip = true; - $usehost = $hostnameip; - } - else - { - $usingip = false; - $usehost = $hostname; - } - //The developer guide has lowercasing and validating IP other way round but its more efficient to - //have it this way - //Now we move onto canonicalizing the path - $pathparts = explode('/',$path); - foreach($pathparts as $key=>$value) - { - if($value=="..") - { - if($key!=0) - { - unset($pathparts[$key-1]); - unset($pathparts[$key]); - } - else - unset($pathparts[$key]); - } - elseif($value=="."||empty($value)) - unset($pathparts[$key]); - } - if(substr($path,-1,1)=="/") - $append = "/"; - else - $append = false; - $path = "/".implode("/",$pathparts); - if($append&&substr($path,-1,1)!="/") - $path .= $append; - $usehost = $this->flexURLEncode($usehost); - $path = $this->flexURLEncode($path); - $query = $this->flexURLEncode($query); - if(empty($parts['scheme'])) - $parts['scheme'] = 'http'; - $canurl = $parts['scheme'].'://'; - $realurl = $canurl; - if(!empty($parts['userinfo'])) - $realurl .= $parts['userinfo'].'@'; - $canurl .= $usehost; - $realurl .= $usehost; - if(!empty($parts['port'])) - { - $canurl .= ':'.$parts['port']; - $realurl .= ':'.$parts['port']; - } - $canurl .= $path; - $realurl .= $path; - if(substr_count($finalurl,"?")>0) - { - $canurl .= '?'.$parts['query']; - $realurl .= '?'.$parts['query']; - } - if(!empty($parts['fragment'])) - $realurl .= '#'.$parts['fragment']; - return array("GSBURL"=>$canurl,"CleanURL"=>$realurl,"Parts"=>array("Host"=>$usehost,"Path"=>$path,"Query"=>$query,"IP"=>$usingip)); - } - /*SHA-256 input (short method).*/ - function sha256($data) - { - return hash('sha256',$data); - } - /*Make Hostkeys for use in a lookup*/ - function makeHostKey($host,$usingip) - - { - if($usingip) - $hosts = array($host."/"); - - else - { - $hostparts = explode(".",$host); - if(count($hostparts)>2) - { - $backhostparts = array_reverse($hostparts); - $threeparts = array_slice($backhostparts,0,3); - $twoparts = array_slice($threeparts,0,2); - $hosts = array(implode('.',array_reverse($threeparts))."/",implode('.',array_reverse($twoparts))."/"); - } - else - $hosts = array($host."/"); - } - //Now make key & key prefix - $returnhosts = array(); - foreach($hosts as $value) - { - $fullhash = $this->sha256($value); - $returnhosts[$fullhash] = array("Host"=>$value,"Prefix"=>substr($fullhash,0,8),"Hash"=>$fullhash); - } - return $returnhosts; - } - /*Hash up a list of values from makePrefixes() (will possibly be - combined into that function at a later date*/ - function makeHashes($prefixarray) - { - if(count($prefixarray)>0) - { - $returnprefixes = array(); - foreach($prefixarray as $value) - { - $fullhash = $this->sha256($value); - $returnprefixes[$fullhash] = array("Original"=>$value,"Prefix"=>substr($fullhash,0,8),"Hash"=>$fullhash); - } - return $returnprefixes; - } - else - return false; - } - /*Make URL prefixes for use after a hostkey check*/ - function makePrefixes($host,$path,$query,$usingip) - { - $prefixes = array(); - //Exact hostname in the url - $hostcombos = array(); - $hostcombos[] = $host; - if(!$usingip) - { - $hostparts = explode('.',$host); - $backhostparts = array_reverse($hostparts); - if(count($backhostparts)>5) - $maxslice = 5; - else - $maxslice = count($backhostparts); - $topslice = array_slice($backhostparts,0,$maxslice); - while($maxslice>1) - { - $hostcombos[] = implode('.',array_reverse($topslice)); - $maxslice--; - $topslice = array_slice($backhostparts,0,$maxslice); - } - } - else - $hostcombos[] = $host; - $hostcombos = array_unique($hostcombos); - $variations = array(); - if(!empty($path)) - { - $pathparts = explode("/",$path); - if(count($pathparts)>4) - $upperlimit = 4; - else - $upperlimit = count($pathparts); - } - foreach($hostcombos as $key=>$value) - { - if(!empty($query)) - $variations[] = $value.$path.'?'.$query; - $variations[] = $value.$path; - if(!empty($path)) - { - $i = 0; - $pathiparts = ""; - while($i<$upperlimit) - { - if($i!=count($pathparts)-1) - $pathiparts = $pathiparts.$pathparts[$i]."/"; - else - $pathiparts = $pathiparts.$pathparts[$i]; - $variations[] = $value.$pathiparts; - $i++; - } - } - } - $variations = array_unique($variations); - return $this->makeHashes($variations); - } - /*Process data provided from the response of a full-hash GSB - request*/ - function processFullLookup($data) - { - $clonedata = $data; - $extracthash = array(); - while(strlen($clonedata)>0) - { - $splithead = explode("\n",$clonedata,2); - $chunkinfo = explode(':',$splithead[0]); - $listname = $chunkinfo[0]; - $addchunk = $chunkinfo[1]; - $chunklen = $chunkinfo[2]; - $chunkdata = bin2hex(substr($splithead[1],0,$chunklen)); - while(strlen($chunkdata)>0) - { - $extracthash[$listname][$addchunk] = substr($chunkdata,0,64); - $chunkdata = substr($chunkdata,64); - } - $clonedata = substr($splithead[1],$chunklen); - } - return $extracthash; - } - /*Add a full-hash key to a prefix or hostkey (the variable is $prefix but it could - be either).*/ - function addFullHash($prefix,$chunknum,$fullhash,$listname) - { - $buildtrunk = $listname."-a"; - //First check hosts - $result = mysql_query("SELECT * FROM `$buildtrunk-hosts` WHERE `Hostkey` = '$prefix' AND `Chunknum` = '$chunknum'"); - if($result&&mysql_num_rows($result)>0) - { - while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) - { - if(empty($row['FullHash'])) - { - //We've got a live one! Insert the full hash for it - $addresult = mysql_query("UPDATE `$buildtrunk-hosts` SET `FullHash` = '$fullhash' WHERE `ID` = '{$row['ID']}';"); - if(!$addresult) - $this->fatalerror("Could not cache full-hash key. $prefix, $chunknum, $fullhash, $listname"); - } - } - } - else - { - //If there are no rows it must be a prefix - $result = mysql_query("SELECT * FROM `$buildtrunk-prefixes` WHERE `Prefix` = '$prefix'"); - while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) - { - if(empty($row['FullHash'])) - { - $resulttwo = mysql_query("SELECT * FROM `$buildtrunk-hosts` WHERE `Hostkey` = '{$row['Hostkey']}' AND `Chunknum` = '$chunknum'"); - while ($rowtwo = mysql_fetch_array($resulttwo, MYSQL_ASSOC)) - { - if(hexdec($rowtwo['Count'])>0) - { - $addresult = mysql_query("UPDATE `$buildtrunk-prefixes` SET `FullHash` = '$fullhash' WHERE `ID` = '{$row['ID']}';"); - if(!$addresult) - $this->fatalerror("Could not cache full-hash key. $prefix, $chunknum, $fullhash, $listname"); - } - } - } - } - } - - } - /*Check database for any cached full-length hashes for a given prefix.*/ - function cacheCheck($prefix) - { - foreach($this->usinglists as $value) - { - $buildtrunk = $value."-a"; - $result = mysql_query("SELECT * FROM `$buildtrunk-hosts` WHERE `Hostkey` = '$prefix' AND `FullHash` != ''"); - if($result&&mysql_num_rows($result)>0) - { - while($row = mysql_fetch_array($result, MYSQL_ASSOC)) - { - return array($row['FullHash'],$row['Chunknum']); - } - } - else - { - $result = mysql_query("SELECT * FROM `$buildtrunk-prefixes` WHERE `Prefix` = '$prefix' AND `FullHash` != ''"); - if($result&&mysql_num_rows($result)>0) - { - while($row = mysql_fetch_array($result, MYSQL_ASSOC)) - { - $resulttwo = mysql_query("SELECT * FROM `$buildtrunk-hosts` WHERE `Hostkey` = '{$row['Hostkey']}'"); - while ($rowtwo = mysql_fetch_array($resulttwo, MYSQL_ASSOC)) - { - if(hexdec($rowtwo['Count'])>0) - { - return array($row['FullHash'],$rowtwo['Chunknum']); - } - - } - } - } - } - } - return false; - } - /*Do a full-hash lookup based on prefixes provided, returns (bool) true - on a match and (bool) false on no match.*/ - function doFullLookup($prefixes,$originals) - { - //Store copy of original prefixes - $cloneprefixes = $prefixes; - //They should really all have the same prefix size.. we'll just check one - $prefixsize = strlen($prefixes[0][0])/2; - $length = count($prefixes)*$prefixsize; - foreach($prefixes as $key=>$value) - { - //Check cache on each iteration (we can return true earlier if we get a match!) - $cachechk = $this->cacheCheck($value[0]); - if($cachechk) - { - if(isset($originals[$cachechk[0]])) - { - //Check from same chunk - foreach($cloneprefixes as $nnewvalue) - { - if($nnewvalue[1]==$cachechk[1]&&$value[0]==$originals[$cachechk[0]]['Prefix']) - { - //From same chunks - return true; - } - - } - } - } - $prefixes[$key] = pack("H*",$value[0]); - } - //No cache matches so we continue with request - $body = "$prefixsize:$length\n".implode("",$prefixes); - - $buildopts = array(CURLOPT_POST=>true,CURLOPT_POSTFIELDS=>$body); - $result = $this->googleDownloader("http://safebrowsing.clients.google.com/safebrowsing/gethash?client=api&apikey=".$this->apikey."&appver=".$this->version."&pver=".$this->apiversion,$buildopts,"lookup"); - - if($result[0]['http_code']==200&&!empty($result[1])) - { - //Extract hashes from response - $extractedhashes = $this->processFullLookup($result[1]); - //Loop over each list - foreach($extractedhashes as $key=>$value) - { - //Loop over each value in each list - foreach($value as $newkey=>$newvalue) - { - if(isset($originals[$newvalue])) - { - //Okay it matches a full-hash we have, now to check they're from the same chunks - foreach($cloneprefixes as $nnewvalue) - { - if($nnewvalue[1]==$newkey&&$nnewvalue[0]==$originals[$newvalue]['Prefix']) - { - //From same chunks - //Add full hash to database (cache) - $this->addFullHash($nnewvalue[0],$nnewvalue[1],$newvalue,$key); - return true; - } - - } - } - } - } - return false; - } - elseif($result[0]['http_code']==204&&strlen($result[1])==0) - { - //204 Means no match - return false; - } - else - { - //"No No No! This just doesn't add up at all!" - $this->fatalerror("ERROR: Invalid response returned from GSB ({$result[0]['http_code']})"); - } - } - /*Checks to see if a match for a prefix is found in the sub table, if it is then we won't do - a full-hash lookup. Return true on match in sub list, return false on negative.*/ - function subCheck($listname,$prefixlist,$mode) - { - $buildtrunk = $listname.'-s'; - if($mode=="prefix") - { - //Mode is prefix so the add part was a prefix, not a hostkey so we just check prefixes (saves a lookup) - foreach($prefixlist as $value) - { - $result = mysql_query("SELECT * FROM `$buildtrunk-prefixes` WHERE `Prefix` = '{$value[0]}'"); - if($result&&mysql_num_rows($result)>0) - { - //As interpreted from Developer Guide if theres a match in sub list it cancels out the add listing - //we'll double check its from the same chunk just to be pedantic - while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) - { - if(hexdec($row['AddChunkNum'])==$value[1]) - return true; - } - } - - } - return false; - } - elseif($mode=="hostkey") - { - //Mode is hostkey - foreach($prefixlist as $value) - { - $result = mysql_query("SELECT * FROM `$buildtrunk-prefixes` WHERE `Hostkey` = '{$value[0]}'"); - if($result&&mysql_num_rows($result)>0) - { - //As interpreted from Developer Guide if theres a match in sub list it cancels out the add listing - //we'll double check its from the same chunk just to be pedantic - while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) - { - if(hexdec($row['AddChunkNum'])==$value[1]&&empty($row['Prefix'])) - return true; - } - } - - } - return false; - } - $this->fatalerror("Invalid SubCheck Mode $mode"); - } - /*Does a full URL lookup on given lists, will check if its in database, if slight match there then - will do a full-hash lookup on GSB, returns (bool) true on match and (bool) false on negative.*/ - function doLookup($url) - { - $lists = $this->usinglists; - //First canonicalize the URL - $canurl = $this->Canonicalize($url); - //Make hostkeys - $hostkeys = $this->makeHostKey($canurl['Parts']['Host'],$canurl['Parts']['IP']); - $matches = array(); - foreach($lists as $key=>$value) - { - $buildtrunk = $value.'-a'; - //Loop over each list - foreach($hostkeys as $keyinner=>$valueinner) - { - //Within each list loop over each hostkey - $result = mysql_query("SELECT * FROM `$buildtrunk-hosts` WHERE `Hostkey` = '{$valueinner['Prefix']}'"); - if($result&&mysql_num_rows($result)>0) - { - //For each hostkey match - while ($row = mysql_fetch_array($result, MYSQL_ASSOC)) - { - $nicecount = hexdec($row['Count']); - if($nicecount>0) - { - //There was a match and the count is more than one so there are prefixes! - //Hash up a load of prefixes and create the build query if we haven't done so already - if(!isset($prefixes)) - { - $prefixes = $this->makePrefixes($canurl['Parts']['Host'],$canurl['Parts']['Path'],$canurl['Parts']['Query'],$canurl['Parts']['IP']); - $buildprequery = array(); - foreach($prefixes as $prefix) - { - $buildprequery[] = " `Prefix` = '{$prefix['Prefix']}' "; - } - $buildprequery = implode("OR",$buildprequery); - } - //Check if there are any matching prefixes - $resulttwo = mysql_query("SELECT * FROM `$buildtrunk-prefixes` WHERE ($buildprequery) AND `Hostkey` = '{$row['Hostkey']}'"); - if($resulttwo&&mysql_num_rows($resulttwo)>0) - { - //We found prefix matches - $prematches = array(); - $prelookup = array(); - while ($rowtwo = mysql_fetch_array($resulttwo, MYSQL_ASSOC)) - { - $prematches[] = array($rowtwo['Prefix'],$row['Chunknum']); - } - //Before we send off any requests first check whether its in sub table - $subchk = $this->subCheck($value,$prematches,"prefix"); - if(!$subchk) - { - //Send off any matching prefixes to do some full-hash key checks - $flookup = $this->doFullLookup($prematches,$prefixes); - if($flookup) - return true; - } - } - //If we didn't find matches then do nothing (keep looping till end and it'll return negative) - } - else - { - $subchk = $this->subCheck($value,array(array($row['Hostkey'],$row['Chunknum'])),"hostkey"); - if(!$subchk) - { - //There was a match but the count was 0 that entire domain could be a match, Send off to check - $flookup = $this->doFullLookup(array(array($row['Hostkey'],$row['Chunknum'])),$hostkeys); - if($flookup) - return true; - } - } - } - } - } - } - return false; - - } - } -?> \ No newline at end of file +dbConnect($database, $username, $password, $host); + } + $this->verbose = $verbose; + } + + /** + * Get url to service resource with parameters + * + * @param string $resource + * @return string + */ + public function getServiceUrl($resource = '') { + return $this->serviceScheme . '://' . $this->serviceDomain . '/' . $this->serviceResourcePrefix . + $resource . '?client=api&apikey=' . $this->apikey . '&appver=' . $this->version . '&pver=' . $this->apiversion; + } + + public function setService($domain, $resource_prefix = '', $scheme = 'https') { + $this->serviceDomain = $domain; + $this->serviceScheme = $scheme; + $this->serviceResourcePrefix = $resource_prefix; + } + + public function __destruct() { + $this->close(); + } + + private function close() { + $this->log("Closing phpGSB. (Peak Memory: " . (round(memory_get_peak_usage() / 1048576, 3)) . "MB)"); + } + + public function silent() { + $this->verbose = false; + } + + public function enableDebug() { + $this->debug = true; + } + + public function resetDebugLog() { + $this->debugLog = array(); + } + + public function setApiKey($apikey) { + $this->apikey = $apikey; + } + + public function trans_disable() { + $this->transenabled = false; + } + + public function trans_enable() { + $this->transenabled = true; + } + + private function trans_begin() { + if ($this->transenabled) { + $this->transtarted = true; + $this->log("Begin MySQL Transaction"); + $this->db->query('START TRANSACTION;'); + } + } + + private function trans_commit() { + if ($this->transtarted && $this->transenabled) { + $this->transtarted = false; + $this->log("Comitting Transaction"); + $this->db->query('COMMIT;'); + } + } + + private function trans_rollback() { + if ($this->transtarted && $this->transenabled) { + $this->transtarted = false; + $this->log("Rolling Back Transaction"); + $this->db->query('ROLLBACK;'); + } + } + + /** + * Function to output messages, used instead of echo, + * will make it easier to have a verbose switch in later releases + */ + private function log($msg) { + if ($this->verbose) { + echo $msg . "\n"; + } + } + + /** + * Function to output errors, used instead of echo, + * will make it easier to have a verbose switch in later releases + */ + private function fatalerror($msg) { + if ($this->verbose) { + print_r($msg); + echo "\n"; + } + + $this->trans_rollback(); + throw new Exception($msg); + } + + /** + * Wrapper to connect to database. + */ + private function dbConnect($database, $username, $password, $host = "localhost") { + $this->db = new PDO('mysql:host=' . $host . ';dbname=' . $database, + $username, + $password + ); + + $this->db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION); + } + + /** + * Simple logic function to calculate timeout based on the number of previous errors + */ + private function calc($errors) { + // According to Developer Guide Formula + if ($errors == 1) { + // According to Developer Guide (1st error, wait a minute) + return 60; + } elseif ($errors > 5) { + // According to Developer Guide (Above 5 errors check every 4 hours) + return 28800; + } else { + // According to Developer Guide we simply double up our timeout each + // time and use formula: + // (Adapted to be relative to errors) ( ((2^$errors) * 7.5) * + // (decimalrand(0,1) + 1)) to produce + // a result between: 120min-240min for example + return floor((pow(2, $errors) * 7.5) * ((rand(0, 1000) / 1000) + 1)); + } + } + + /** + * Writes backoff timeouts, uses calc() to calculate timeouts and then writes to file + * for next check + */ + private function Backoff($errdata = false, $type) { + $file = ($type == 'data' ? 'nextcheck.dat' : 'nextcheckl.dat'); + + $curstatus = explode('||', file_get_contents($this->pingfilepath . $file)); + $curstatus[1] = $curstatus[1] + 1; + $seconds = $this->calc($curstatus[1]); + $until = time() + $seconds . '||' . $curstatus[1]; + file_put_contents($this->pingfilepath . $file, $until); + $this->fatalerror(array( + "Invalid Response... Backing Off", + $errdata + )); + } + + /** + * Writes timeout from valid requests to nextcheck file + */ + private function setTimeout($seconds) { + if (file_exists($this->pingfilepath . 'nextcheck.dat')) { + $curstatus = explode('||', @file_get_contents($this->pingfilepath . 'nextcheck.dat')); + $until = time() + $seconds . '||' . $curstatus[1]; + } else { + $until = time() + $seconds . '||'; + } + + file_put_contents($this->pingfilepath . 'nextcheck.dat', $until); + } + + /** + * Checks timeout in timeout files (usually performed at the + * start of script) + */ + private function checkTimeout($type) { + $file = ($type == 'data' ? 'nextcheck.dat' : 'nextcheckl.dat'); + + $curstatus = explode('||', @file_get_contents($this->pingfilepath . $file)); + if (time() < $curstatus[0]) { + $this->fatalerror("Must wait another " . ($curstatus[0] - time()) . " seconds before another request"); + } + + $this->log("Allowed to request"); + } + + /** + * Function downloads from URL's, POST data can be + * passed via $options. $followbackoff indicates + * whether to follow backoff procedures or not + */ + private function download($url, $options = NULL, $followbackoff = false) { + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, $url); + curl_setopt($ch, CURLOPT_HEADER, 0); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); + curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); + + if (is_array($options)) { + curl_setopt_array($ch, $options); + } + + $data = curl_exec($ch); + $info = curl_getinfo($ch); + curl_close($ch); + + if ($followbackoff && $info['http_code'] > 299) { + $this->Backoff($info, $followbackoff); + } + + return array( + $info, + $data + ); + } + + //UPDATER FUNCTIONS + + /** + * Resets lists database, only called if GSB issues r:resetdatabase + */ + private function resetDatabase() { + // Lord knows why they would EVER issue this request! + if (!empty($this->adminemail)) { + mail($this->adminemail, 'Reset Database Request Issued', 'For some crazy unknown reason GSB requested a database reset at ' . time()); + } + + foreach ($this->usinglists as $value) { + $this->query("TRUNCATE TABLE `$value-s-index`"); + $this->query("TRUNCATE TABLE `$value-s-hosts`"); + $this->query("TRUNCATE TABLE `$value-s-prefixes`"); + $this->query("TRUNCATE TABLE `$value-a-index`"); + $this->query("TRUNCATE TABLE `$value-a-hosts`"); + $this->query("TRUNCATE TABLE `$value-a-prefixes`"); + } + } + + /** + * Processes data recieved from a GSB data request into a managable array + */ + private function processChunks($data, $listname) { + $len = strlen($data); + $offset = 0; + while ($offset < $len) { + $x = strpos($data, ':', $offset); + $type = substr($data, $offset, $x-$offset); + + $offset = $x+1; + $x = strpos($data, ':', $offset); + $chunknum = substr($data, $offset, $x-$offset); + $offset = $x+1; + if (!is_numeric($chunknum)) { + $this->fatalerror(array( + "Decoding Error, chunknum is not numeric!", + $chunknum + )); + } + + $x = strpos($data, ':', $offset); + $hashlen = substr($data, $offset, $x-$offset); + $offset = $x+1; + if (!is_numeric($hashlen)) { + $this->fatalerror(array( + "Decoding Error, hashlen is not numeric!", + $hashlen + )); + } + $x = strpos($data, "\n", $offset); + $chunklen = substr($data, $offset, $x-$offset); + $offset = $x+1; + $chunkdata = NULL; + if (!is_numeric($chunklen)) { + $this->fatalerror(array( + "Decoding Error, chunklen is not numeric!", + $chunklen + )); + } + if ($chunklen > 0) { + $chunkdata = bin2hex(substr($data, $offset, $chunklen)); + $offset += $chunklen; + } + + if ($type != 'a' && $type != 's') { + $this->log("DISCARDED CHUNKNUM: $chunknum (Had no valid label)"); + continue; + } + + $dataArr = array( + 'chunknum' => $chunknum, + 'hashlen' => $hashlen, + 'chunklen' => $chunklen, + 'real' => array() + ); + + $chunkOffset = 0; + while ($chunkOffset < $chunklen) { + $row = array( + 'hostkey' => substr($chunkdata, $chunkOffset, 8), + 'count' => hexdec(substr($chunkdata, $chunkOffset+8, 2)), + 'pairs' => array() + ); + + $chunkOffset += 10; + if ($row['count'] > 0) { + for ($i = 0; $i < $row['count']; $i++) { + $pair = array(); + if ($type == 's') { + $pair['addchunknum'] = hexdec(substr($chunkdata, $chunkOffset, 8)); + $chunkOffset += 8; + } + $pair['prefix'] = substr($chunkdata, $chunkOffset, ($hashlen * 2)); + $chunkOffset += ($hashlen * 2); + $row['pairs'][] = $pair; + } + } elseif ($row['count'] == 0 && $type == 's') { + $row['pairs'][] = array( + 'addchunknum' => hexdec(substr($chunkdata, $chunkOffset, 8)) + ); + $chunkOffset += 8; + } elseif ($row['count'] < 0) { + $this->fatalerror(array( + "Decoding Error, Somethings gone wrong!", + array($row, $type) + )); + } + $dataArr['real'][] = $row; + } + $this->saveChunkPart($dataArr, ($type == 's' ? 'SUB' : "ADD"), $listname); + unset($dataArr); + } + return true; + } + + /** + * Saves processed data to the MySQL database + */ + private function saveChunkPart($data, $type, $listname) { + $buildindex = array(); + $buildindexValues = array(); + $buildhost = array(); + $buildhostValues = array(); + $buildpairs = array(); + $buildpairsValues = array(); + + //Check what type of data it is... + if ($type == "SUB") { + $listtype = 's'; + } elseif ($type == "ADD") { + $listtype = 'a'; + } else { + $this->fatalerror(array( + "Invalid type given!", + $type + )); + } + + if (!isset($this->mainlist[$listtype][$listname][$data['chunknum']])) { + $this->mainlist[$listtype][$listname][$data['chunknum']] = true; + $buildindex[] = "(?, ?)"; + $buildindexValues[] = $data['chunknum']; + $buildindexValues[] = $data['chunklen']; + + foreach ($data['real'] as $newkey => $newvalue) { + $buildhost[] = "(x?, ?, ?, '')"; + $buildhostValues[] = $newvalue['hostkey']; + $buildhostValues[] = $data['chunknum']; + $buildhostValues[] = $newvalue['count']; + foreach ($newvalue['pairs'] as $innerkey => $innervalue) { + $buildpairs[] = "(x?, " . ($type == 'SUB' ? '?, ' : '') . "x?, '')"; + $buildpairsValues[] = $newvalue['hostkey']; + if ($type == 'SUB') { + $buildpairsValues[] = $innervalue['addchunknum']; + } + + $buildpairsValues[] = (isset($innervalue['prefix']) ? $innervalue['prefix'] : ''); + } + } + } + + + if (!empty($buildindex)) { + //Insert index value + $this->query('INSERT IGNORE INTO `' . $listname . '-' . $listtype. '-index` (`chunk_num`,`chunk_len`) VALUES ' . implode(',', $buildindex), $buildindexValues); + } + + if (!empty($buildhost)) { + //Insert index value + $this->query('INSERT IGNORE INTO `' . $listname . '-' . $listtype. '-hosts` (`hostkey`,`chunk_num`,`count`,`fullhash`) VALUES ' . implode(',', $buildhost), $buildhostValues); + } + + if (!empty($buildpairs)) { + //Insert index value + $this->query('INSERT IGNORE INTO `' . $listname . '-' . $listtype. '-prefixes` (`hostkey`, ' . + ($type == 'SUB' ? '`add_chunk_num`, ' : '') . '`prefix`,`fullhash`) VALUES ' . + implode(',', $buildpairs), $buildpairsValues); + } + } + + /** + * Get ranges of existing chunks from a requested list + * and type (add [a] or sub [s] return them and set + * mainlist to recieved for that chunk (prevent dupes) + */ + private function getRanges($listname, $mode) { + $checktable = $listname . '-' . $mode . '-index'; + + $ranges = array(); + $i = 0; + $start = 0; + $stm = $this->query('SELECT chunk_num FROM `' . $checktable . '` ORDER BY `chunk_num` ASC'); + while ($row = $stm->fetch(\PDO::FETCH_ASSOC)) { + $this->mainlist[$mode][$listname][$row['chunk_num']] = true; + if ($i == 0) { + $start = $row['chunk_num']; + $previous = $row['chunk_num']; + } else { + $expected = $previous + 1; + if ($row['chunk_num'] != $expected) { + if ($start == $previous) { + $ranges[] = $start; + } else { + $ranges[] = $start . '-' . $previous; + } + $start = $row['chunk_num']; + } + $previous = $row['chunk_num']; + } + $i++; + } + + if ($start > 0 && $previous > 0) { + if ($start == $previous) { + $ranges[] = $start; + } else { + $ranges[] = $start . '-' . $previous; + } + } + return $ranges; + } + + /** + * Get both add and sub ranges for a requested list + */ + private function getFullRanges($listname) { + $subranges = $this->getRanges($listname, 's'); + $addranges = $this->getRanges($listname, 'a'); + return array( + "Subranges" => $subranges, + "Addranges" => $addranges + ); + } + + /** + * Format a full request body for a desired list including + * name and full ranges for add and sub + */ + private function formattedRequest($listname) { + $fullranges = $this->getFullRanges($listname); + $buildpart = ''; + + if (count($fullranges['Subranges']) > 0) { + $buildpart .= 's:' . implode(',', $fullranges['Subranges']); + } + + if (count($fullranges['Subranges']) > 0 && count($fullranges['Addranges']) > 0) { + $buildpart .= ':'; + } + + if (count($fullranges['Addranges']) > 0) { + $buildpart .= 'a:' . implode(',', $fullranges['Addranges']); + } + + return $listname . ';' . $buildpart . "\n"; + } + + /** + * Called when GSB returns a SUB-DEL or ADD-DEL response + */ + private function deleteRange($range, $mode, $listname) { + $params = array(); + $buildtrunk = $listname . '-' . $mode; + if (strpos($range, '-') !== false) { + $params = explode('-', trim($range), 2); + $clause = "`chunk_num` >= ? AND `chunk_num` <= ?"; + } else { + $params[] = $range; + $clause = "`chunk_num` = ?"; + } + + // Delete from index + $this->query('DELETE FROM `' . $buildtrunk . '-index` WHERE ' . $clause, $params); + + // Select all host keys that match chunks (we'll delete them after but we + // need the hostkeys list!) + $stm = $this->query('SELECT HEX(`hostkey`) hostkey FROM `' . $buildtrunk . '-hosts` WHERE ' . $clause . " AND hostkey != ''", $params); + $buildprefixdel = array(); + while ($row = $stm->fetch(\PDO::FETCH_ASSOC)) { + $buildprefixdel[] = $row['hostkey']; + } + + if (!empty($buildprefixdel)) { + $this->query('DELETE FROM `' . $buildtrunk . '-hosts` WHERE hostkey IN (' . substr(str_repeat('x?, ', count($buildprefixdel)), 0, -2) . ')', $buildprefixdel); + + //Delete all matching hostkeys + $this->query('DELETE FROM `' . $buildtrunk . '-hosts` WHERE ' . $clause, $params); + } + } + + public function getList() { + $url = $this->getServiceUrl('list'); + $result = $this->download($url); + return explode("\n", trim($result[1])); + } + + /** + * Main part of updater function, will call all other functions, merely + * requires the request body, it will then process and save all data as well as checking + * for ADD-DEL and SUB-DEL, runs silently so won't return anything on success + */ + private function getData($body, $skipSetTimeout = false) { + if (empty($body)) { + return $this->fatalerror("Missing a body for data request"); + } + + $this->trans_begin(); + $buildopts = array( + CURLOPT_POST => true, + CURLOPT_POSTFIELDS => $body . "\n" + ); + + $url = $this->getServiceUrl('downloads'); + $result = $this->download($url, $buildopts, "data"); + + if (preg_match('/n:(\d+)/', $result[1], $match)) { + if (!$skipSetTimeout) { + $this->setTimeout($match[1]); + } + } else { + return $this->fatalerror("Missing timeout"); + } + + if (strpos($result[1], 'r:pleasereset') !== false) { + $this->resetDatabase(); + return true; + } + + if (!preg_match_all('/i:(.+?)\n(.+?)(?=i:|$)/s', $result[1], $blocks, PREG_PATTERN_ORDER)) { + $this->log('No data available in list'); + return true; + } + + foreach ($blocks[1] as $id => $listname) { + if (!preg_match_all('/\s*([^:]+):(.+)/', $blocks[2][$id], $elements, PREG_PATTERN_ORDER)) { + return $this->fatalerror('could not parse response'); + } + + foreach ($elements[1] as $id => $type) { + $value = trim($elements[2][$id]); + switch($type) { + case 'u': + $chunkdata = $this->download('http://' . $value, false, "data"); + $processed = $this->processChunks($chunkdata[1], $listname); + $this->log("Saved a chunk file: " . $value); + break; + case 'sd': + case 'ad': + $delType = substr($type, 0, 1); + foreach (explode(',', $value) as $keyadd => $valueadd) { + $this->deleteRange($valueadd, $delType, $listname); + } + break; + } + } + } + + $this->trans_commit(); + return true; + } + + /** + * Shortcut to run updater + */ + public function runUpdate($skipCheckTimeout = false, $skipSetTimeout = false) { + if (!$skipCheckTimeout) { + $this->checkTimeout('data'); + } + $require = ""; + foreach ($this->usinglists as $value) { + $require .= $this->formattedRequest($value); + } + + $this->log("Using $require"); + $this->getData($require, $skipSetTimeout); + } + + //LOOKUP FUNCTIONS + /** + * Used to check the canonicalize function + */ + public function validateMethod() { + //Input => Expected + $cases = array( + "http://host/%25%32%35" => "http://host/%25", + "http://host/%25%32%35%25%32%35" => "http://host/%25%25", + "http://host/%2525252525252525" => "http://host/%25", + "http://host/asdf%25%32%35asd" => "http://host/asdf%25asd", + "http://host/%%%25%32%35asd%%" => "http://host/%25%25%25asd%25%25", + "http://www.google.com/" => "http://www.google.com/", + "http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/" => "http://168.188.99.26/.secure/www.ebay.com/", + "http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/" => "http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/", + "http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B" => 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+', + "http://3279880203/blah" => "http://195.127.0.11/blah", + "http://www.google.com/blah/.." => "http://www.google.com/", + "www.google.com/" => "http://www.google.com/", + "www.google.com" => "http://www.google.com/", + "http://www.evil.com/blah#frag" => "http://www.evil.com/blah", + "http://www.GOOgle.com/" => "http://www.google.com/", + "http://www.google.com.../" => "http://www.google.com/", + "http://www.google.com/foo\tbar\rbaz\n2" => "http://www.google.com/foobarbaz2", + "http://www.google.com/q?" => "http://www.google.com/q?", + "http://www.google.com/q?r?" => "http://www.google.com/q?r?", + "http://www.google.com/q?r?s" => "http://www.google.com/q?r?s", + "http://evil.com/foo#bar#baz" => "http://evil.com/foo", + "http://evil.com/foo;" => "http://evil.com/foo;", + "http://evil.com/foo?bar;" => "http://evil.com/foo?bar;", + "http://\x01\x80.com/" => "http://%01%80.com/", + "http://notrailingslash.com" => "http://notrailingslash.com/", + "http://www.gotaport.com:1234/" => "http://www.gotaport.com:1234/", + " http://www.google.com/ " => "http://www.google.com/", + "http:// leadingspace.com/" => "http://%20leadingspace.com/", + "http://%20leadingspace.com/" => "http://%20leadingspace.com/", + "%20leadingspace.com/" => "http://%20leadingspace.com/", + "https://www.securesite.com/" => "https://www.securesite.com/", + "http://host.com/ab%23cd" => "http://host.com/ab%23cd", + "http://host.com//twoslashes?more//slashes" => "http://host.com/twoslashes?more//slashes" + ); + + foreach ($cases as $key => $value) { + $canit = self::canonicalizeURL($key); + $canit = $canit['GSBURL']; + if ($canit == $value) { + $this->log("PASSED: $key"); + } else { + $this->log("INVALid:
ORIGINAL: $key
EXPECTED: $value
RECIEVED: $canit
"); + } + } + } + + /** + * Special thanks Steven Levithan (stevenlevithan.com) for the ridiculously complicated regex + * required to parse urls. This is used over parse_url as it robustly provides access to + * port, userinfo etc and handles mangled urls very well. + * + * Expertly integrated into phpGSB by Sam Cleaver ;) + * Thanks to mikegillis677 for finding the seg. fault issue in the old function. + * Passed validateMethod() check on 17/01/12 + */ + private static function j_parseUrl($url) { + $strict = '/^(?:([^:\/?#]+):)?(?:\/\/\/?((?:(([^:@]*):?([^:@]*))?@)?([^:\/?#]*)(?::(\d*))?))?(((?:\/(\w:))?((?:[^?#\/]*\/)*)([^?#]*))(?:\?([^#]*))?(?:#(.*))?)/'; + $loose = '/^(?:(?![^:@]+:[^:@\/]*@)([^:\/?#.]+):)?(?:\/\/\/?)?((?:(([^:@]*):?([^:@]*))?@)?([^:\/?#]*)(?::(\d*))?)(((?:\/(\w:))?(\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/'; + preg_match($loose, $url, $match); + if (empty($match)) { + //As odd as its sounds, we'll fall back to strict (as technically its + // more correct and so may salvage completely mangled urls) + unset($match); + preg_match($strict, $url, $match); + } + $parts = array( + "source" => '', + "scheme" => '', + "authority" => '', + "userinfo" => '', + "user" => '', + "password" => '', + "host" => '', + "port" => '', + "relative" => '', + "path" => '', + "drive" => '', + "directory" => '', + "file" => '', + "query" => '', + "fragment" => '' + ); + switch (count ($match)) { + case 15 : + $parts['fragment'] = $match[14]; + case 14 : + $parts['query'] = $match[13]; + case 13 : + $parts['file'] = $match[12]; + case 12 : + $parts['directory'] = $match[11]; + case 11 : + $parts['drive'] = $match[10]; + case 10 : + $parts['path'] = $match[9]; + case 9 : + $parts['relative'] = $match[8]; + case 8 : + $parts['port'] = $match[7]; + case 7 : + $parts['host'] = $match[6]; + case 6 : + $parts['password'] = $match[5]; + case 5 : + $parts['user'] = $match[4]; + case 4 : + $parts['userinfo'] = $match[3]; + case 3 : + $parts['authority'] = $match[2]; + case 2 : + $parts['scheme'] = $match[1]; + case 1 : + $parts['source'] = $match[0]; + } + return $parts; + } + + /** + * Regex to check if its a numerical IP address + */ + private static function is_ip($ip) { + return preg_match("/^([1-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])" . "(\.([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}$/", $ip); + } + + /** + * Checks if input is in hex format + */ + private static function is_hex($x) { + // Relys on the fact that hex often includes letters meaning PHP will + // disregard the string + if (($x + 3) == 3) { + return dechex(hexdec($x)) == $x; + } + + return false; + } + + /** + * Checks if input is in octal format + */ + private static function is_octal($x) { + //Relys on the fact that in IP addressing octals must begin with a 0 to + // denote octal + return substr($x, 0, 1) == 0; + } + + /** + * Converts hex or octal input into decimal + */ + private static function hexoct2dec($value) { + //As this deals with parts in IP's we can be more exclusive + if (substr_count(substr($value, 0, 2), '0x') > 0 && self::is_hex($value)) { + return hexdec($value); + } elseif (self::is_octal($value)) { + return octdec($value); + } + + return false; + } + + /** + * Converts IP address part in HEX to decimal + */ + private static function iphexdec($hex) { + //Removes any leading 0x (used to denote hex) and then and leading 0's) + $temp = str_replace('0x', '', $hex); + $temp = ltrim($temp, "0"); + return hexdec($temp); + } + + /** + * Converts full IP address in HEX to decimal + */ + private static function hexIPtoIP($hex) { + // Remove hex identifier and leading 0's (not significant) + $tempip = str_replace('0x', '', $hex); + $tempip = ltrim($tempip, "0"); + + // It might be hex + if (self::is_hex($tempip)) { + // There may be a load of junk before the part we need + if (strlen($tempip) > 8) { + $tempip = substr($tempip, -8); + } + $hexplode = preg_split('//', $tempip, -1, PREG_SPLIT_NO_EMPTY); + while (count($hexplode) < 8) { + array_unshift($hexplode, 0); + } + + // Normalise + $newip = hexdec($hexplode[0] . $hexplode[1]) . '.' . + hexdec($hexplode[2] . $hexplode[3]) . '.' . + hexdec($hexplode[4] . $hexplode[5]) . '.' . + hexdec($hexplode[6] . $hexplode[7]); + //Now check if its an IP + if (self::is_ip($newip)) { + return $newip; + } + } + return false; + } + + /** + * Checks if an IP provided in either hex, octal or decimal is in fact + * an IP address. Normalises to a four part IP address. + */ + private static function isValid_IP($ip) { + // First do a simple check, if it passes this no more needs to be done + if (self::is_ip($ip)) { + return $ip; + } + + // Its a toughy... eerm perhaps its all in hex? + $checkhex = self::hexIPtoIP($ip); + if ($checkhex) { + return $checkhex; + } + + // If we're still here it wasn't hex... maybe a DWORD format? + $checkdword = self::hexIPtoIP(dechex($ip)); + if ($checkdword) { + return $checkdword; + } + + // Nope... maybe in octal or a combination of standard, octal and hex?! + $ipcomponents = explode('.', $ip); + $ipcomponents[0] = self::hexoct2dec($ipcomponents[0]); + if (count($ipcomponents) == 2) { + // The writers of the RFC docs certainly didn't think about the + // clients! This could be a DWORD mixed with an IP part + if ($ipcomponents[0] <= 255 && is_int($ipcomponents[0]) && is_int($ipcomponents[1])) { + $threeparts = dechex($ipcomponents[1]); + $hexplode = preg_split('//', $threeparts, -1, PREG_SPLIT_NO_EMPTY); + if (count($hexplode) > 4) { + $newip = $ipcomponents[0] . '.' . + self::iphexdec($hexplode[0] . $hexplode[1]) . '.' . + self::iphexdec($hexplode[2] . $hexplode[3]) . '.' . + self::iphexdec($hexplode[4] . $hexplode[5]); + // Now check if its valid + if (self::is_ip($newip)) { + return $newip; + } + } + } + } + + $ipcomponents[1] = self::hexoct2dec($ipcomponents[1]); + if (count($ipcomponents) == 3) { + //Guess what... it could also be a DWORD mixed with two IP parts! + if (($ipcomponents[0] <= 255 && is_int($ipcomponents[0])) && ($ipcomponents[1] <= 255 && is_int($ipcomponents[1])) && is_int($ipcomponents[2])) { + $twoparts = dechex($ipcomponents[2]); + $hexplode = preg_split('//', $twoparts, -1, PREG_SPLIT_NO_EMPTY); + if (count($hexplode) > 3) { + $newip = $ipcomponents[0] . '.' . + $ipcomponents[1] . '.' . + self::iphexdec($hexplode[0] . $hexplode[1]) . '.' . + self::iphexdec($hexplode[2] . $hexplode[3]); + // Now check if its valid + if ($this->is_ip($newip)) + return $newip; + } + } + } + // If not it may be a combination of hex and octal + if (count($ipcomponents) >= 4) { + $tmpcomponents = array( + $ipcomponents[2], + $ipcomponents[3] + ); + + foreach ($tmpcomponents as $key => $value) { + if (!$tmpcomponents[$key] = self::hexoct2dec($value)) { + return false; + } + } + + array_unshift($tmpcomponents, $ipcomponents[0], $ipcomponents[1]); + // Convert back to IP form + $newip = implode('.', $tmpcomponents); + + // Now check if its valid + if (self::is_ip($newip)) { + return $newip; + } + } + + // Well its not an IP that we can recognise... theres only so much we can + // do! + return false; + } + + /** + * Had to write another layer as built in PHP urlencode() escapes all non + * alpha-numeric Google states to only urlencode if its below 32 or above + * or equal to 127 (some of those are non alpha-numeric and so urlencode + * on its own won't work). + */ + private static function flexURLEncode($url, $ignorehash = false) { + // Had to write another layer as built in PHP urlencode() escapes all non + // alpha-numeric + // google states to only urlencode if its below 32 or above or equal to + // 127 (some of those + // are non alpha-numeric and so urlencode on its own won't work). + $urlchars = preg_split('//', $url, -1, PREG_SPLIT_NO_EMPTY); + if (count($urlchars) > 0) { + foreach ($urlchars as $key => $value) { + $ascii = ord($value); + if ($ascii <= 32 || $ascii >= 127 || ($value == '#' && !$ignorehash) || $value == '%') { + $urlchars[$key] = rawurlencode($value); + } + } + + return implode('', $urlchars); + } + return $url; + } + + /** + * Canonicalize a full URL according to Google's definition. + */ + public static function canonicalizeURL($url) { + // Remove line feeds, return carriages, tabs, vertical tabs + $finalurl = trim(str_replace(array( + "\x09", + "\x0A", + "\x0D", + "\x0B" + ), '', $url)); + + // URL Encode for easy extraction + $finalurl = self::flexURLEncode($finalurl, true); + + // Now extract hostname & path + $parts = self::j_parseUrl($finalurl); + $hostname = $parts['host']; + $path = $parts['path']; + $query = $parts['query']; + $lasthost = ""; + $lastpath = ""; + $lastquery = ""; + + // Remove all hex coding (loops max of 50 times to stop craziness but + // should never reach that) + for ($i = 0; $i < 50; $i++) { + $hostname = rawurldecode($hostname); + $path = rawurldecode($path); + $query = rawurldecode($query); + if ($hostname == $lasthost && $path == $lastpath && $query == $lastquery) + break; + $lasthost = $hostname; + $lastpath = $path; + $lastquery = $query; + } + + // Deal with hostname first + // Replace all leading and trailing dots + $hostname = trim($hostname, '.'); + + // Replace all consecutive dots with one dot + $hostname = preg_replace("/\.{2,}/", ".", $hostname); + + // Make it lowercase + $hostname = strtolower($hostname); + + // See if its a valid IP + $hostnameip = self::isValid_IP($hostname); + if ($hostnameip) { + $usingip = true; + $usehost = $hostnameip; + } else { + $usingip = false; + $usehost = $hostname; + } + // The developer guide has lowercasing and validating IP other way round + // but its more efficient to + // have it this way + // Now we move onto canonicalizing the path + $pathparts = explode('/', $path); + foreach ($pathparts as $key => $value) { + if ($value == "..") { + if ($key != 0) { + unset($pathparts[$key - 1]); + unset($pathparts[$key]); + } else { + unset($pathparts[$key]); + } + } elseif ($value == "." || empty($value)) { + unset($pathparts[$key]); + } + } + + if (substr($path, -1, 1) == "/") { + $append = "/"; + } else { + $append = false; + } + + $path = "/" . implode("/", $pathparts); + + if ($append && substr($path, -1, 1) != "/") { + $path .= $append; + } + + $usehost = self::flexURLEncode($usehost); + $path = self::flexURLEncode($path); + $query = self::flexURLEncode($query); + + if (empty($parts['scheme'])) { + $parts['scheme'] = 'http'; + } + + $canurl = $parts['scheme'] . '://'; + $realurl = $canurl; + + if (!empty($parts['userinfo'])) { + $realurl .= $parts['userinfo'] . '@'; + } + + $canurl .= $usehost; + $realurl .= $usehost; + + if (!empty($parts['port'])) { + $canurl .= ':' . $parts['port']; + $realurl .= ':' . $parts['port']; + } + + $canurl .= $path; + $realurl .= $path; + if (substr_count($finalurl, "?") > 0) { + $canurl .= '?' . $parts['query']; + $realurl .= '?' . $parts['query']; + } + + if (!empty($parts['fragment'])) { + $realurl .= '#' . $parts['fragment']; + } + + return array( + "GSBURL" => $canurl, + "CleanURL" => $realurl, + "Parts" => array( + "Host" => $usehost, + "Path" => $path, + "Query" => $query, + "IP" => $usingip + ) + ); + } + + /** + * SHA-256 input (short method). + */ + private static function sha256($data) { + return hash('sha256', $data); + } + + /** + * Make hostkeys for use in a lookup + */ + private static function makeHostKey($host, $usingip) { + if ($usingip) { + $hosts = array($host . "/"); + } else { + $hostparts = explode(".", $host); + if (count($hostparts) > 2) { + $backhostparts = array_reverse($hostparts); + $threeparts = array_slice($backhostparts, 0, 3); + $twoparts = array_slice($threeparts, 0, 2); + $hosts = array( + implode('.', array_reverse($threeparts)) . "/", + implode('.', array_reverse($twoparts)) . "/" + ); + } else + $hosts = array($host . "/"); + } + + // Now make key & key prefix + $returnhosts = array(); + foreach ($hosts as $value) { + $fullhash = self::sha256($value); + $returnhosts[$fullhash] = array( + "Host" => $value, + "prefix" => substr($fullhash, 0, 8), + "Hash" => $fullhash + ); + } + + return $returnhosts; + } + + /** + * Hash up a list of values from makeprefixes() (will possibly be combined into that function at a later date + */ + private static function makeHashes($prefixarray) { + if (count($prefixarray) > 0) { + $returnprefixes = array(); + foreach ($prefixarray as $value) { + $fullhash = self::sha256($value); + $returnprefixes[$fullhash] = array( + "Original" => $value, + "prefix" => substr($fullhash, 0, 8), + "Hash" => $fullhash + ); + } + return $returnprefixes; + } else + return false; + } + + /** + * Make URL prefixes for use after a hostkey check + */ + public static function makeprefixes($host, $path, $query, $usingip) { + $prefixes = array(); + + // Exact hostname in the url + $hostcombos = array(); + $hostcombos[] = $host; + if (!$usingip) { + $hostparts = explode('.', $host); + $backhostparts = array_reverse($hostparts); + if (count($backhostparts) > 5) { + $maxslice = 5; + } else { + $maxslice = count($backhostparts); + } + + $topslice = array_slice($backhostparts, 0, $maxslice); + while ($maxslice > 1) { + $hostcombos[] = implode('.', array_reverse($topslice)); + $maxslice--; + $topslice = array_slice($backhostparts, 0, $maxslice); + } + } else { + $hostcombos[] = $host; + } + + $hostcombos = array_unique($hostcombos); + $variations = array(); + if (!empty($path)) { + $pathparts = explode("/", $path); + if (count($pathparts) > 4) { + $upperlimit = 4; + } else { + $upperlimit = count($pathparts); + } + } + + foreach ($hostcombos as $key => $value) { + if (!empty($query)) { + $variations[] = $value . $path . '?' . $query; + } + + $variations[] = $value . $path; + if (!empty($path)) { + $i = 0; + $pathiparts = ""; + while ($i < $upperlimit) { + if ($i != count($pathparts) - 1) { + $pathiparts = $pathiparts . $pathparts[$i] . "/"; + } else { + $pathiparts = $pathiparts . $pathparts[$i]; + } + $variations[] = $value . $pathiparts; + $i++; + } + } + } + + $variations = array_unique($variations); + return self::makeHashes($variations); + } + + /** + * Process data provided from the response of a full-hash GSB + * request + */ + private function processFullLookup($data) { + $extracthash = array(); + + $len = strlen($data); + $offset = 0; + while ($offset < $len) { + $x = strpos($data, "\n", $offset); + $head = substr($data, $offset, $x-$offset); + $offset = $x+1; + list($listname, $addchunk, $chunklen) = explode(':', $head, 3); + + if ($chunklen > 0) { + $extracthash[$listname][$addchunk] = bin2hex(substr($data, $offset, $chunklen)); + $offset += $chunklen; + } + } + + return $extracthash; + } + + /** + * Add a full-hash key to a prefix or hostkey (the variable is $prefix + * but it could be either). + */ + private function addfullhash($prefix, $chunknum, $fullhash, $listname) { + $buildtrunk = $listname . "-a"; + + // First check hosts + $stm = $this->query("SELECT id, HEX(hostkey) hostkey, chunk_num, count, HEX(fullhash) fullhash FROM `" . $buildtrunk ."-hosts` WHERE `hostkey` = x? AND `chunk_num` = ? AND fullhash = '' LIMIT 1", array($prefix, $chunknum)); + if ($stm->rowCount() > 0) { + $row = $stm->fetch(\PDO::FETCH_ASSOC); + // We've got a live one! Insert the full hash for it + $this->query("UPDATE `" . $buildtrunk . "-hosts` SET `fullhash` = x? WHERE `id` = ?", array($fullhash, $row['id'])); + } else { + $this->query(" + UPDATE + `" . $buildtrunk ."-prefixes` p + JOIN `" . $buildtrunk . "-hosts` h ON (h.hostkey = p.hostkey) + SET + p.fullhash = x?, + h.fullhash = x? + WHERE + p.`prefix` = x? AND + p.fullhash = '' AND + h.chunk_num = ? AND + h.count > 0 + ", array($fullhash, $fullhash, $prefix, $chunknum)); + } + } + + /** + * Check database for any cached full-length hashes for a given prefix. + */ + private function cacheCheck($prefix) { + foreach ($this->usinglists as $value) { + $buildtrunk = $value . "-a"; + $stm = $this->query("SELECT id, HEX(hostkey) hostkey, chunk_num, count, HEX(fullhash) fullhash FROM `" . $buildtrunk . "-hosts` WHERE `hostkey` = x? AND `fullhash` != ''", array($prefix)); + if ($stm->rowCount() > 0) { + $row = $stm->fetch(\PDO::FETCH_ASSOC); + return array( + $row['fullhash'], + $row['chunk_num'] + ); + } + + $stm = $this->query("SELECT HEX(p.fullhash) fullhash, h.chunk_num FROM + `" . $buildtrunk . "-prefixes` p + JOIN `" . $buildtrunk . "-hosts` h ON (p.hostkey = h.hostkey) + WHERE p.`prefix` = x? AND p.`fullhash` != '' AND h.count > 0", array($prefix)); + if ($stm->rowCount() > 0) { + $row = $stm->fetch(\PDO::FETCH_ASSOC); + return array( + $row['fullhash'], + $row['chunk_num'] + ); + } + } + + return false; + } + + /** + * Do a full-hash lookup based on prefixes provided, + * returns (bool) true on a match and (bool) false on no match. + */ + private function doFullLookup($prefixes, $originals) { + // Store copy of original prefixes + $cloneprefixes = $prefixes; + + // They should really all have the same prefix size.. we'll just check one + $prefixsize = strlen($prefixes[0][0]) / 2; + $length = count($prefixes) * $prefixsize; + foreach ($prefixes as $key => $value) { + // Check cache on each iteration (we can return true earlier if we get + // a match!) + $cachechk = $this->cacheCheck($value[0]); + if ($cachechk) { + if (isset($originals[$cachechk[0]])) { + //Check from same chunk + foreach ($cloneprefixes as $nnewvalue) { + if ($nnewvalue[1] == $cachechk[1] && $value[0] == $originals[$cachechk[0]]['prefix']) { + //From same chunks + return true; + } + + } + } + } + $prefixes[$key] = pack("H*", $value[0]); + } + // No cache matches so we continue with request + $body = $prefixsize . ":" . $length . "\n" . implode("", $prefixes); + + $buildopts = array( + CURLOPT_POST => true, + CURLOPT_POSTFIELDS => $body + ); + + $url = $this->getServiceUrl('gethash'); + + $result = $this->download($url, $buildopts, "lookup"); + if ($result[0]['http_code'] == 200 && !empty($result[1])) { + // Extract hashes from response + // Loop over each list + foreach ($this->processFullLookup($result[1]) as $listname => $chunks) { + // Loop over each value in each list + foreach ($chunks as $newkey => $fullhash) { + if (isset($originals[$fullhash])) { + // Okay it matches a full-hash we have, now to check + // they're from the same chunks + foreach ($cloneprefixes as $nnewvalue) { + if ($nnewvalue[1] == $newkey && $nnewvalue[0] == $originals[$fullhash]['prefix']) { + // From same chunks + // Add full hash to database (cache) + $this->addfullhash($nnewvalue[0], $nnewvalue[1], $fullhash, $listname); + return true; + } + + } + } + } + } + return false; + } elseif ($result[0]['http_code'] == 204 && strlen($result[1]) == 0) { + // 204 Means no match + return false; + } else { + // "No No No! This just doesn't add up at all!" + $this->fatalerror("ERROR: Invalid response returned from GSB ({$result[0]['http_code']})"); + } + } + + /** + * Checks to see if a match for a prefix is found in the sub table, if it is + * then we won't do a full-hash lookup. + * Return true on match in sub list, return false on negative. + */ + private function subCheck($listname, $prefixlist, $mode) { + $buildtrunk = $listname . '-s'; + foreach ($prefixlist as $value) { + $stm = $this->query("SELECT id FROM `". $buildtrunk . "-prefixes` WHERE " . + ($mode == 'prefix' ? '`prefix`' : 'hostkey') . ' = x? AND add_chunk_num = ? LIMIT 1', array($value[0], $value[1])); + // As interpreted from Developer Guide if theres a match in + // sub list it cancels out the add listing + // we'll double check its from the same chunk just to be pedantic + if ($stm->rowCount() > 0) { + return true; + } + } + return false; + } + + /** + * query wrapper + */ + private function query($sql, $data = array()) { + $stm = $this->db->prepare($sql); + $stm->execute($data); + if ($this->debug) { + $this->debugLog[] = array($sql, $data, $stm->rowCount()); + + } + return $stm; + } + + /** + * create tables + */ + public function install() { + foreach ($this->usinglists as $listname) { + $this->query("CREATE TABLE IF NOT EXISTS `" . $listname . "-a-hosts` ( + `id` int(11) unsigned NOT NULL AUTO_INCREMENT, + `hostkey` BINARY(4) NOT NULL, + `chunk_num` int(11) unsigned NOT NULL, + `count` int(11) unsigned NOT NULL DEFAULT '0', + `fullhash` BINARY(32) NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `hostkey_2` (`hostkey`,`chunk_num`,`count`,`fullhash`), + KEY `hostkey` (`hostkey`) + ) ENGINE=InnoDB DEFAULT CHARSET=latin1;"); + + $this->query("CREATE TABLE IF NOT EXISTS `" . $listname . "-a-index` ( + `chunk_num` int(11) unsigned NOT NULL AUTO_INCREMENT, + `chunk_len` int(11) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`chunk_num`) + ) ENGINE=InnoDB DEFAULT CHARSET=latin1;"); + + $this->query("CREATE TABLE IF NOT EXISTS `" . $listname . "-a-prefixes` ( + `id` int(11) unsigned NOT NULL AUTO_INCREMENT, + `hostkey` BINARY(4) NOT NULL, + `prefix` BINARY(4) NOT NULL, + `fullhash` BINARY(32) NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `hostkey_2` (`hostkey`,`prefix`), + KEY `hostkey` (`hostkey`) + ) ENGINE=InnoDB DEFAULT CHARSET=latin1;"); + + $this->query("CREATE TABLE IF NOT EXISTS `" . $listname . "-s-hosts` ( + `id` int(11) unsigned NOT NULL AUTO_INCREMENT, + `hostkey` BINARY(4) NOT NULL, + `chunk_num` int(11) unsigned NOT NULL, + `count` int(11) unsigned NOT NULL DEFAULT '0', + `fullhash` BINARY(32) NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `hostkey_2` (`hostkey`,`chunk_num`,`count`,`fullhash`), + KEY `hostkey` (`hostkey`) + ) ENGINE=InnoDB DEFAULT CHARSET=latin1;"); + + $this->query("CREATE TABLE IF NOT EXISTS `" . $listname . "-s-index` ( + `chunk_num` int(11) unsigned NOT NULL AUTO_INCREMENT, + `chunk_len` int(11) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (`chunk_num`) + ) ENGINE=InnoDB DEFAULT CHARSET=latin1;"); + + $this->query("CREATE TABLE IF NOT EXISTS `" . $listname . "-s-prefixes` ( + `id` int(11) unsigned NOT NULL AUTO_INCREMENT, + `hostkey` BINARY(4) NOT NULL, + `add_chunk_num` int(11) unsigned NOT NULL, + `prefix` BINARY(4) NOT NULL, + `fullhash` BINARY(32) NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY `hostkey_2` (`hostkey`,`add_chunk_num`,`prefix`), + KEY `hostkey` (`hostkey`) + ) ENGINE=InnoDB DEFAULT CHARSET=latin1;"); + } + } + + /** + * Does a full URL lookup on given lists, will check if its in database, if + * slight match there then will do a full-hash lookup on GSB, + * listname on match and (bool) false on negative. + */ + public function doLookup($url) { + $lists = $this->usinglists; + //First canonicalize the URL + $canurl = self::canonicalizeURL($url); + + //Make hostkeys + $hostkeys = self::makeHostKey($canurl['Parts']['Host'], $canurl['Parts']['IP']); + + $prefixes = self::makeprefixes($canurl['Parts']['Host'], $canurl['Parts']['Path'], $canurl['Parts']['Query'], $canurl['Parts']['IP']); + + $prefixParams = array(); + $buildprequery = array(); + foreach ($prefixes as $prefix) { + $buildprequery[] = " `prefix` = x?"; + $prefixParams[] = $prefix['prefix']; + } + $buildprequery = implode("OR", $buildprequery); + if (!empty($buildprequery)) { + $buildprequery .= ' AND'; + } + + $matches = array(); + foreach ($lists as $key => $listname) { + $buildtrunk = $listname . '-a'; + $hostsStm = $this->db->prepare('SELECT count, HEX(hostkey) hostkey, chunk_num FROM `' . $buildtrunk . '-hosts` WHERE hostkey = x?'); + + //Loop over each list + foreach ($hostkeys as $keyinner => $valueinner) { + + if ($this->debug) { + $this->debugLog[] = array('SELECT count, HEX(hostkey) hostkey, chunk_num FROM `' . $buildtrunk . '-hosts` WHERE hostkey = x?', array($valueinner['prefix']), $hostsStm->rowCount()); + } + // Within each list loop over each hostkey + $hostsStm->execute(array($valueinner['prefix'])); + + // For each hostkey match + while ($row = $hostsStm->fetch(\PDO::FETCH_ASSOC)) { + if ($row['count'] > 0) { + + // There was a match and the count is more than one so + // there are prefixes! + // Hash up a load of prefixes and create the build + // query if we haven't done so already + $params = $prefixParams; + $params[] = $row['hostkey']; + + if ($this->debug) { + $this->debugLog[] = array("SELECT FROM `" . $buildtrunk . "-prefixes` WHERE " . $buildprequery . " `hostkey` = x?", $param); + } + + // Check if there are any matching prefixes + $stm = $this->query("SELECT HEX(prefix) prefix FROM `" . $buildtrunk . "-prefixes` WHERE " . $buildprequery . " `hostkey` = x?", $params); + if ($stm->rowCount() > 0) { + // We found prefix matches + $prematches = array(); + $prelookup = array(); + while ($rowPrefix = $stm->fetch(\PDO::FETCH_ASSOC)) { + $prematches[] = array( + $rowPrefix['prefix'], + $row['chunk_num'] + ); + } + + // Before we send off any requests first check + // whether its in sub table + if (!$this->subCheck($listname, $prematches, "prefix") && + $this->doFullLookup($prematches, $prefixes)) { + return $listname; + } + } + + // If we didn't find matches then do nothing (keep + // looping till end and it'll return negative) + } elseif (!$this->subCheck($listname, array(array($row['hostkey'], $row['chunk_num'])), "hostkey") && + $this->doFullLookup(array(array($row['hostkey'], $row['chunk_num'])), $hostkeys)) { + return $listname; + } + } + } + } + return false; + } +}