Skip to content Skip to sidebar Skip to footer

Membuat Web Crawler Sederhana By RieqyNS13


Kang Karding - Membuat Web Crawler Sederhana By RieqyNS13

Kita bahas langsung satu persatu gan

1. Penentu URL (alamat web) tujuan.
Bagian ini akan mengambil satu URL yang belum diproses dari database.

/**
 * Fetch the next URL to crawl and mark it as taken.
 *
 * Selects the lowest-id row of tbl_url whose status is '0', sets that row's
 * status to '1', and returns its url. When no pending row is found (or the
 * SELECT fails) the hard-coded seed URL is returned instead.
 *
 * @return string URL to download next
 */
function db_get_url()
{
    $url = '';

    $rs = mysql_query("SELECT id, url FROM tbl_url WHERE status='0' ORDER BY id LIMIT 1");
    // mysql_query() returns false on failure; only fetch from a real result.
    $data = ($rs !== false) ? mysql_fetch_array($rs) : false;

    if ($data) {
        $url = $data['url'];

        // Cast the id so the UPDATE only ever receives an integer.
        $id = (int) $data['id'];
        mysql_query("UPDATE tbl_url SET status='1' WHERE id='" . $id . "' ");
    }

    // Nothing pending: fall back to the seed URL.
    if ($url == '') {
        $url = 'http://planet.terasi.net';
    }

    return $url;
}

3. Pemarsing (pemroses) hasil downloadan.
Fungsi parseHTML akan menerima string HTML, kemudian mengekstrak semua link yang ada di string tersebut. Dari setiap link tadi diambil domainnya saja untuk kemudian disimpan ke dalam database.

/**
 * Extract every anchor link from an HTML string and queue its domain.
 *
 * Each href value is reduced to its domain with getDomain(); every distinct
 * non-empty domain is passed to db_insert_url() once per call.
 *
 * @param string $html HTML source to scan
 * @return void
 */
function parseHTML($html)
{
    // href may be single- or double-quoted and need not be the first
    // attribute of the <a> tag; group 2 holds the attribute value.
    $pattern = '/<a\s(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1/i';
    if (!preg_match_all($pattern, $html, $match)) {
        return;
    }

    // Pages usually link to the same domain many times; insert each one once.
    $seen = array();
    foreach ($match[2] as $link) {
        $domain = getDomain($link);
        if ($domain == '' || isset($seen[$domain])) {
            continue;
        }
        $seen[$domain] = true;
        db_insert_url($domain);
    }
}

Berikut ini adalah source code lengkapnya.

<?php

// Connect to the MySQL server that holds the crawl queue (tbl_url).
// NOTE: the mysql_* extension used throughout this script was removed in
// PHP 7.0, so the script needs PHP 5.x (or a port to mysqli/PDO).
$db = mysql_connect('localhost', 'phpkita', 'phpkita');
if ($db === false) {
    // Without a connection every query below would fail on each iteration.
    die('Could not connect to MySQL: ' . mysql_error());
}
if (!mysql_select_db('db_phpkita', $db)) {
    die('Could not select database: ' . mysql_error($db));
}

// Crawl forever: take the next pending URL, download it, store the HTML,
// then queue every domain linked from that page.
while (true)
{
    $url = db_get_url();
    $html = getURL($url);
    db_update_html($url, $html);
    parseHTML($html);
}

// Not reached while the loop above has no exit condition; kept so the
// connection is closed if a break is ever added.
mysql_close($db);
exit;

/*
 * Helper functions
 */
/**
 * Fetch the next URL to crawl and mark it as taken.
 *
 * Selects the lowest-id row of tbl_url whose status is '0', sets that row's
 * status to '1', and returns its url. When no pending row is found (or the
 * SELECT fails) the hard-coded seed URL is returned instead.
 *
 * @return string URL to download next
 */
function db_get_url()
{
    $url = '';

    $rs = mysql_query("SELECT id, url FROM tbl_url WHERE status='0' ORDER BY id LIMIT 1");
    // mysql_query() returns false on failure; only fetch from a real result.
    $data = ($rs !== false) ? mysql_fetch_array($rs) : false;

    if ($data) {
        $url = $data['url'];

        // Cast the id so the UPDATE only ever receives an integer.
        $id = (int) $data['id'];
        mysql_query("UPDATE tbl_url SET status='1' WHERE id='" . $id . "' ");
    }

    // Nothing pending: fall back to the seed URL.
    if ($url == '') {
        $url = 'http://planet.terasi.net';
    }

    return $url;
}

/**
 * Download a URL with cURL and return the response body.
 *
 * Redirects are followed (at most 5). The connect timeout is 60 seconds;
 * the total transfer timeout is $delay seconds, or 60 when $delay is 0.
 *
 * @param string $url   address to download; surrounding whitespace is trimmed
 * @param int    $delay total timeout in seconds, 0 selects the 60 s default
 * @return string response body, or "" for an empty URL or a failed transfer
 */
function getURL($url, $delay=0) {
    $url = trim($url);
    if ($url == "") {
        return "";
    }

    $ch = curl_init();
    if ($ch === false) {
        return "";
    }

    $delay = intval($delay);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 60);
    curl_setopt($ch, CURLOPT_TIMEOUT, ($delay != 0) ? $delay : 60);

    $result = curl_exec($ch);
    curl_close($ch);

    // curl_exec() yields false when the transfer fails; callers expect a string.
    return is_string($result) ? $result : "";
}

/**
 * Extract every anchor link from an HTML string and queue its domain.
 *
 * Each href value is reduced to its domain with getDomain(); every distinct
 * non-empty domain is passed to db_insert_url() once per call.
 *
 * @param string $html HTML source to scan
 * @return void
 */
function parseHTML($html)
{
    // href may be single- or double-quoted and need not be the first
    // attribute of the <a> tag; group 2 holds the attribute value.
    $pattern = '/<a\s(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1/i';
    if (!preg_match_all($pattern, $html, $match)) {
        return;
    }

    // Pages usually link to the same domain many times; insert each one once.
    $seen = array();
    foreach ($match[2] as $link) {
        $domain = getDomain($link);
        if ($domain == '' || isset($seen[$domain])) {
            continue;
        }
        $seen[$domain] = true;
        db_insert_url($domain);
    }
}

/**
 * Queue a URL for crawling.
 *
 * Inserts a tbl_url row with the given url, an empty html value and
 * status '0'. The result of the query is not inspected.
 *
 * @param string $url URL to store; escaped before being placed in the SQL
 * @return void
 */
function db_insert_url($url)
{
    $escapedUrl = mysql_real_escape_string($url);
    mysql_query(
        "INSERT INTO tbl_url (url, html, status) VALUES ('$escapedUrl', '', '0')"
    );
}

/**
 * Store downloaded HTML on the tbl_url row(s) matching a URL.
 *
 * Both values are escaped before being placed in the SQL; the result of
 * the query is not inspected.
 *
 * @param string $url  URL whose row should be updated
 * @param string $html page source to save in the html column
 * @return void
 */
function db_update_html($url, $html)
{
    $safeUrl = mysql_real_escape_string($url);
    $safeHtml = mysql_real_escape_string($html);

    mysql_query("UPDATE tbl_url SET html='$safeHtml' WHERE url='$safeUrl' ");
}

/**
 * Extract the scheme-and-host prefix of an absolute URL.
 *
 * Accepts http:// and https:// URLs (case-insensitive) and returns the
 * scheme plus the run of host characters (letters, digits, underscore,
 * dot, hyphen) that follows it. Paths, ports and query strings are cut off.
 *
 * @param string $url link to inspect
 * @return string e.g. "http://my-site.com", or '' when $url is not an absolute http(s) URL
 */
function getDomain($url)
{
    // The hyphen sits last in the class so it is a literal, not a range operator.
    if (preg_match('/^(https?:\/\/[\w.-]+)/i', $url, $match)) {
        return $match[1];
    }

    return '';
}
?>

Jangan lupa buat juga tabelnya di database:

-- Crawl queue: one row per discovered URL.
CREATE TABLE `tbl_url` (
`id` int(8) NOT NULL AUTO_INCREMENT,
`url` varchar(128) NOT NULL,
-- MEDIUMTEXT instead of TEXT: TEXT holds at most 65,535 bytes, which is
-- smaller than many HTML pages.
`html` mediumtext NOT NULL,
-- 0 = not yet crawled, 1 = already handed out to the crawler
`status` int(1) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`),
-- Rejects duplicate URLs, so each URL is queued only once.
UNIQUE KEY `url` (`url`)
) ENGINE=MyISAM;

sumber : http://phpkita.wordpress.com

Post a Comment for "Membuat Web Crawler Sederhana By RieqyNS13"