Quantcast
Channel: AutoIt v3 - General Help and Support
Viewing all articles
Browse latest Browse all 12506

Help updating a web crawler

$
0
0
<p>OK, so I found this script and I have some questions:
[ autoit ]         
; ----------------------------------------------------------------------------
;
; AutoIt Version: 3.1.1.87
; Author:        AcidicChip <acidicchip@acidicchip.com>
;
; Script Name:  Web Media Spider
; Script Version: 0.21
;
; Script Function:
;   Spider the web and gather media file URLs
;
; Files used (all relative to the working directory):
;   spider.urls.txt    - queue of page URLs still to be crawled, one per line
;   spider.html.txt    - temporary copy of the page currently being parsed
;   spider.images.log  - "<media url><TAB><referring page url>" per image
;   spider.audio.log   - same format, audio files
;   spider.video.log   - same format, video files
;
; ----------------------------------------------------------------------------

Opt("GUIOnEventMode", 1)
Opt("TrayIconDebug", 1)

#include <Array.au3>
#include <GUIConstants.au3>

Dim $collected[1]       ; history of URLs already queued; element 0 is unused
Dim $urls[1]            ; unused by the current code; kept from the original script
Dim $urlon = 0          ; line number in spider.urls.txt of the last URL dequeued
Dim $urlnum = 0         ; number of queued URLs not yet crawled
Dim $imagenum = 0       ; images logged so far
Dim $audionum = 0       ; audio files logged so far
Dim $videonum = 0       ; videos logged so far
Dim $running = False    ; True while the main loop should keep crawling

#region "GUI"
GUICreate("Media Spider", 600, 100)
$lblAction = GUICtrlCreateLabel("Action:", 0, 3, 35, 20)
$txtAction = GUICtrlCreateInput("", 40, 0, 560, 20)
GUICtrlSetState($txtAction, $GUI_DISABLE)
$lblURL = GUICtrlCreateLabel("URL:", 0, 23, 35, 20)
$txtURL = GUICtrlCreateInput("", 40, 20, 560, 20)
GUICtrlSetState($txtURL, $GUI_DISABLE)
$prgPercent = GUICtrlCreateProgress(0, 40, 560, 20)
$txtPercent = GUICtrlCreateInput("0%", 560, 40, 40, 20)
GUICtrlSetState($txtPercent, $GUI_DISABLE)
$lblURLs = GUICtrlCreateLabel("URLs:", 0, 63, 35, 20)
$txtURLs = GUICtrlCreateInput("0", 40, 60, 75, 20)
GUICtrlSetState($txtURLs, $GUI_DISABLE)
$lblAudio = GUICtrlCreateLabel("Audio:", 125, 63, 35, 20)
$txtAudio = GUICtrlCreateInput("0", 160, 60, 75, 20)
GUICtrlSetState($txtAudio, $GUI_DISABLE)
$lblImages = GUICtrlCreateLabel("Images:", 245, 63, 36, 20)
$txtImages = GUICtrlCreateInput("0", 285, 60, 75, 20)
GUICtrlSetState($txtImages, $GUI_DISABLE)
$lblVideos = GUICtrlCreateLabel("Videos:", 370, 63, 35, 20)
$txtVideos = GUICtrlCreateInput("0", 410, 60, 75, 20)
GUICtrlSetState($txtVideos, $GUI_DISABLE)
$lblHistory = GUICtrlCreateLabel("History:", 490, 63, 35, 20)
$txtHistory = GUICtrlCreateInput("0", 530, 60, 75, 20)
GUICtrlSetState($txtHistory, $GUI_DISABLE)
$lblStartURL = GUICtrlCreateLabel("Start URL:", 0, 83, 50, 20)
$txtStartURL = GUICtrlCreateInput("http://www.myspace.com/acidicchip", 55, 80, 490, 20)
$btnStartStop = GUICtrlCreateButton("Start", 550, 80, 50, 20)
GUISetState(@SW_SHOW)
GUISetOnEvent($GUI_EVENT_CLOSE, "GUIClose")
GUICtrlSetOnEvent($btnStartStop, "GUIStartStop")
#endregion "GUI"

; Main loop. The crawl runs here rather than inside the button's event
; handler: while an OnEvent handler is executing, further GUI events are not
; dispatched, so a crawl loop inside the handler could never be stopped.
While 1
    If $running Then
        CrawlStep()
    Else
        Sleep(250)
    EndIf
Wend

; Event handler: window close button.
Func GUIClose()
    Exit
EndFunc ;==>GUIClose

; Event handler: Start/Stop button. Only toggles state and returns at once;
; the actual crawling is done by CrawlStep() from the main loop.
Func GUIStartStop()
    If $running Then
        StopCrawl()
    Else
        GUICtrlSetData($btnStartStop, "Stop")
        GUICtrlSetState($txtStartURL, $GUI_DISABLE)
        ; Start from a clean queue: the file is deleted, so the line cursor,
        ; pending count and history must be reset with it.
        FileDelete("spider.urls.txt")
        $urlon = 0
        $urlnum = 0
        ReDim $collected[1]
        AddURL(GUICtrlRead($txtStartURL))
        $running = True
    EndIf
EndFunc ;==>GUIStartStop

; Leaves crawling mode and restores the controls to their idle state.
Func StopCrawl()
    $running = False
    GUICtrlSetData($btnStartStop, "Start")
    GUICtrlSetState($txtStartURL, $GUI_ENABLE)
EndFunc ;==>StopCrawl

; Dequeues the next URL from spider.urls.txt and crawls it. Stops the crawl
; when the queue is exhausted.
Func CrawlStep()
    If $urlnum <= 0 Then
        StopCrawl()
        Return
    EndIf
    $urlon = $urlon + 1
    Local $url = FileReadLine("spider.urls.txt", $urlon)
    $urlnum = $urlnum - 1
    GetURLs($url)
EndFunc ;==>CrawlStep

; Returns $part as a whole-number percentage of $whole, clamped to 0..100.
; Returns 0 when $whole is not positive (unknown download size, or a tag list
; with a single entry) instead of dividing by zero.
Func Percent($part, $whole)
    If $whole <= 0 Then Return 0
    Local $pct = Round(($part / $whole) * 100)
    If $pct > 100 Then $pct = 100
    If $pct < 0 Then $pct = 0
    Return $pct
EndFunc ;==>Percent

; Refreshes every status control.
;   $action  - text for the "Action" field
;   $url     - text for the "URL" field; left unchanged when empty
;   $percent - progress bar value, 0..100
Func Status($action, $url, $percent)
    GUICtrlSetData($txtAction, $action)
    If $url <> "" Then GUICtrlSetData($txtURL, $url)
    GUICtrlSetData($prgPercent, $percent)
    GUICtrlSetData($txtPercent, $percent & "%")
    GUICtrlSetData($txtURLs, $urlnum)
    GUICtrlSetData($txtAudio, $audionum)
    GUICtrlSetData($txtImages, $imagenum)
    GUICtrlSetData($txtVideos, $videonum)
    GUICtrlSetData($txtHistory, UBound($collected))
EndFunc ;==>Status

; Returns an array of every (case-insensitive, non-greedy) piece of text found
; between $before and $after in $str. $before and $after are inserted into the
; pattern as-is, so they must already be valid regular-expression fragments.
Func _ArrayParse($str, $before, $after)
    Return StringRegExp($str, "(?i)" & $before & "(.*?)" & $after, 3)
EndFunc ;==>_ArrayParse

; Queues $url for crawling unless it is already in the history.
Func AddURL($url)
    If Not WasCollected($url) Then
        _ArrayAdd($collected, $url)
        FileWriteLine("spider.urls.txt", $url)
        $urlnum = $urlnum + 1
    EndIf
EndFunc ;==>AddURL

; Returns True when $url is already in the history (case-sensitive match).
; When it is not and the history holds 1024 or more entries, the oldest entry
; is dropped so the caller can append without the array growing unbounded.
Func WasCollected($url)
    For $i = 1 To UBound($collected) - 1
        If $collected[$i] == $url Then Return True
    Next
    If UBound($collected) >= 1024 Then _ArrayDelete($collected, 1)
    Return False
EndFunc ;==>WasCollected

; Builds "<scheme><host>/<dir>/.../" from $path (host plus path, no scheme and
; no query string). Path segments are appended until one is empty or contains
; a "." (taken to be a file name). $skip is the number of trailing segments
; that are never considered.
Func _BuildBase($scheme, $path, $skip)
    Local $parts = StringSplit($path, "/")
    Local $base = $scheme & $parts[1] & "/"
    For $i = 2 To UBound($parts) - 1 - $skip
        If StringInStr($parts[$i], ".") Or Not StringLen($parts[$i]) Then ExitLoop
        $base = $base & $parts[$i] & "/"
    Next
    Return $base
EndFunc ;==>_BuildBase

; Returns the directory part of $url, with a trailing "/", used as the base
; for resolving relative links. For URLs with a query string the candidate
; directory is probed with InetGetSize(); if that reports nothing, the last
; path segment is assumed to be a script name and is dropped as well.
Func GetURI($url)
    Local $scheme = StringMid($url, 1, StringInStr($url, "://")) & "//"
    Local $path = StringMid($url, StringLen($scheme) + 1)
    Local $uri
    If StringInStr($path, "?") Then
        Local $split = StringSplit($path, "?")
        $path = $split[1]
        $uri = _BuildBase($scheme, $path, 0)
        If Not InetGetSize(StringLeft($uri, StringLen($uri) - 1)) Then
            $uri = _BuildBase($scheme, $path, 1)
        EndIf
    Else
        $uri = _BuildBase($scheme, $path, 0)
    EndIf
    Return $uri
EndFunc ;==>GetURI

; Downloads $url to a temporary file, then feeds every <a>, <img> and <embed>
; tag found in it to CheckURL().
Func GetURLs($url)
    Local $uri = GetURI($url)
    Local $file = "spider.html.txt"

    Status("Downloading", $url, 0)
    Local $filesize = InetGetSize($url)
    Local $lastsize = 0
    Local $strikes = 0
    InetGet($url, $file, 1, 1)
    While @InetGetActive
        ; A "strike" is a 250 ms poll in which no new bytes arrived; give up
        ; on the download after 30 of them.
        If $lastsize == @InetGetBytesRead Then $strikes = $strikes + 1
        If $strikes >= 30 Then
            ; Cancel the stalled transfer so it does not keep the temporary
            ; file open or block the next background download.
            InetGet("abort")
            ExitLoop
        EndIf
        $lastsize = @InetGetBytesRead
        Status("Downloading", $url, Percent($lastsize, $filesize))
        Sleep(250)
    Wend
    Local $html = FileRead($file, FileGetSize($file))
    FileDelete($file)

    Status("Parsing URLs", $url, 0)
    _ScanTags($html, "<a", "Checking <A> Tags for URLs", $uri, $url)
    _ScanTags($html, "<img", "Checking <IMG> Tags for URLs", $uri, $url)
    _ScanTags($html, "<embed", "Checking <EMBED> Tags for URLs", $uri, $url)
EndFunc ;==>GetURLs

; Runs CheckURL() on the attribute text of every tag in $html that starts with
; $open (for example "<img"), showing $label and the progress in the GUI.
Func _ScanTags($html, $open, $label, $uri, $url)
    Local $tags = _ArrayParse($html, $open, ">")
    Local $last = UBound($tags) - 1     ; -1 when nothing matched: loop is skipped
    For $i = 0 To $last
        Status($label, $url, Percent($i, $last))
        CheckURL($uri, $tags[$i], $url)
    Next
EndFunc ;==>_ScanTags

; Extracts the href= and src= targets from one tag's attribute text ($str),
; resolves them against $uri and classifies them. $ref is the page the tag
; was found on.
Func CheckURL($uri, $str, $ref)
    If StringInStr($str, "href=") Then _CheckLink($uri, GetAttr($str, "href="), $ref)
    If StringInStr($str, "src=") Then _CheckLink($uri, GetAttr($str, "src="), $ref)
EndFunc ;==>CheckURL

; Makes $turl absolute when it has no "://" near its start, then passes it to
; CheckType(). Empty targets are ignored.
; NOTE: a leading "/" is resolved against $uri (the page's directory), not
; against the site root; this matches the original script's behaviour.
Func _CheckLink($uri, $turl, $ref)
    If $turl == "" Then Return
    If Not StringInStr(StringLeft($turl, 10), "://") Then
        If StringLeft($turl, 1) == "/" Then
            $turl = $uri & StringMid($turl, 2)
        Else
            $turl = $uri & $turl
        EndIf
    EndIf
    CheckType($turl, $ref)
EndFunc ;==>_CheckLink

; Returns the value of attribute $attr (given with its trailing "=", for
; example "href=") from tag text $str. Handles double-quoted, single-quoted
; and unquoted values. Returns "" when no value can be extracted.
Func GetAttr($str, $attr)
    Local $temp
    If StringInStr($str, $attr & '"') Then
        $temp = _ArrayParse($str, $attr & '"', '"')
        If UBound($temp) >= 1 Then Return $temp[0]
    ElseIf StringInStr($str, $attr & "'") Then
        $temp = _ArrayParse($str, $attr & "'", "'")
        If UBound($temp) >= 1 Then Return $temp[0]
    ElseIf StringInStr($str, $attr) Then
        ; Unquoted value: runs up to the next space, or to the end of the tag.
        $temp = StringMid($str, StringInStr($str, $attr) + StringLen($attr))
        If StringInStr($temp, " ") Then
            $temp = StringMid($temp, 1, StringInStr($temp, " ") - 1)
        EndIf
        Return $temp
    EndIf
    Return ""
EndFunc ;==>GetAttr

; Classifies $url by its file extension (case-insensitive):
;   images      - logged to spider.images.log
;   audio/video - logged to spider.audio.log / spider.video.log, and the
;                 directory holding the file is queued for crawling
;   archives and executables - ignored
;   anything else - queued for crawling as a page
; Each log line is "<url><TAB><ref>", where $ref is the page the link was
; found on.
Func CheckType($url, $ref)
    Local $ext4 = StringLower(StringRight($url, 4))
    Local $ext5 = StringLower(StringRight($url, 5))

    If $ext4 == ".jpg" Or $ext4 == ".gif" Or $ext4 == ".png" Or $ext4 == ".bmp" Then
        FileWriteLine("spider.images.log", $url & @TAB & $ref)
        $imagenum = $imagenum + 1
    ElseIf $ext4 == ".mp3" Or $ext4 == ".rbs" Then
        FileWriteLine("spider.audio.log", $url & @TAB & $ref)
        $audionum = $audionum + 1
        AddURL(GetURI($url))
    ElseIf $ext4 == ".avi" Or $ext4 == ".wmv" Or $ext4 == ".mpg" Or $ext5 == ".mpeg" Then
        FileWriteLine("spider.video.log", $url & @TAB & $ref)
        $videonum = $videonum + 1
        AddURL(GetURI($url))
    ElseIf $ext4 == ".exe" Or $ext4 == ".zip" Or $ext4 == ".rar" Or $ext4 == ".tar" Then
        ; Binary downloads: neither logged nor crawled.
    Else
        AddURL($url)
    EndIf
EndFunc ;==>CheckType

As you can see, it saves the .mp3 URL (URL1), then writes a tab followed by the URL of the page it was found on (URL2). Is it possible to save the page title instead of URL2?

Viewing all articles
Browse latest Browse all 12506

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>