// UrlExtractor pulls URL-like substrings out of free text.
//
// Two passes are made over the input:
//   1. `full`  - text that starts with one of the known schemes followed by "://".
//   2. `guess` - any run of URL body characters; a run is kept only when its
//                host part is a dotted-quad IP or a domain name whose last
//                label is in `tlds`. Kept runs without a scheme get "ftp://"
//                (when they begin with "ftp") or "http://" prepended.
UrlExtractor := Object clone do(
  // Regex source fragments.
  schemes := "(http|ftp|https|news|irc|mailto)"
  guesses := "(www|ftp)"
  // NOTE(review): the class lists only lowercase letters - confirm whether the
  // Regex addon in use matches case-insensitively.
  body := """[a-z0-9_\-+\\/:?%.&!~;,=\#<>]"""
  ip := """[0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}[.][0-9]{1,3}"""
  dn := """([a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?\.)*[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?"""

  // Accepted top-level domains, as a list of strings. newSlot also provides
  // the setTlds setter so callers can swap in their own list.
  newSlot("tlds")
  tlds := "ac ad ae af ag ai al am an ao aq ar as at au aw az ax ba bb bd be bf bg bh bi bj bm bn bo br bs bt bv bw by bz ca cc cd cf cg ch ci ck cl cm cn co cr cs cu cv cx cy cz de dj dk dm do dz ec ee eg eh er es et fi fj fk fm fo fr ga gb gd ge gf gg gh gi gl gm gn gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il im in io iq ir is it je jm jo jp ke kg kh ki km kn kp kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md mg mh mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nc ne nf ng ni nl no np nr nu nz om pa pe pf pg ph pk pl pm pn pr ps pt pw py qa re ro ru rw sa sb sc sd se sg sh si sj sk sl sm sn so sr st sv sy sz tc td tf tg th tj tk tl tm tn to tp tr tt tv tw tz ua ug uk um us uy uz va vc ve vg vi vn vu wf ws ye yt yu za zm zw arpa com edu gov int mil net org biz info name pro aero coop museum" split

  // Compiled patterns. `partial` is not used by any method in this object; it
  // is kept because it is a publicly visible slot.
  full := Regex clone setPattern(schemes .. "://" .. body .. "+")
  partial := Regex clone setPattern(guesses .. "[.]" .. body .. "+")
  guess := Regex clone setPattern(body .. "+")
  dnre := Regex clone setPattern(dn)
  ipre := Regex clone setPattern(ip)

  // extractUrls(text) -> List of URL strings found in text.
  // Scheme-qualified matches come first, in order of appearance, followed by
  // guessed matches in order of appearance.
  extractUrls := method(text,
    result := list

    full setString(text) eachMatch(m, result append(m string))

    guess setString(text) eachMatch(m,
      value := m string
      // Skip runs already collected verbatim by the `full` pass.
      if(result contains(value) not and(containsIp(value) or containsDomainName(value)),
        // Only invent a scheme when the candidate does not already carry one;
        // otherwise the result would be malformed ("http://gopher://host").
        url := if(value findSeq("://"),
          value,
          if(value beginsWithSeq("ftp"), "ftp", "http") .. "://" .. value
        )
        result append(url)
      )
    )

    result
  )

  // containsIp(candidate) -> true when the host part of candidate is exactly
  // one dotted-quad match of `ipre`. Octet ranges are not validated.
  containsIp := method(candidate,
    domain := guessDomain(candidate)
    result := false
    ipre setString(candidate) eachMatch(m,
      result = result or(m string == domain)
    )
    result
  )

  // containsDomainName(candidate) -> true when the host part of candidate
  // contains a name with at least two labels whose last label is in `tlds`.
  containsDomainName := method(candidate,
    domain := guessDomain(candidate)
    result := false
    dnre setString(domain) eachMatch(m,
      subdomains := m string split(".")
      tld := subdomains last
      result = result or(subdomains size >(1) and(tlds contains(tld)))
    )
    result
  )

  // guessDomain(candidate) -> the host part of candidate: everything after an
  // optional "scheme://" prefix, up to (not including) the next "/".
  // When there is no "/" the host runs to the end of the string; the scheme
  // prefix is stripped in that case too.
  guessDomain := method(candidate,
    i := candidate findSeq("://")
    i = if(i == nil, 0, i + 3)
    j := candidate findSeq("/", i)
    candidate slice(i, if(j, j, candidate size))
  )
)