Recognizing links in text is somewhat difficult, but some applications, such as pietty and cterm, have managed to do it accurately.
cterm uses regexes in an embedded Python script. Maybe the algorithm can be used. I've translated the comments.
Code: Select all
# ---------------------------------------------------------------------------
# Building blocks.  The identifiers ``epathch``, ``pathch``, ``alnum`` and
# ``double`` are written literally inside the patterns below and are expanded
# with str.replace() just before compilation (a poor man's macro system).
# ``epathch`` must be expanded before ``pathch`` because the latter is a
# substring of the former.
# ---------------------------------------------------------------------------
epathch = r'[-\w\.\?\#=&\+~!%\*@\(\):,]'         # path char, single-byte only
pathch = r'[-\w\.\?\#=&\+~!%\*@\(\):,double]'    # path char, incl. \x80-\xff
alnum = r'[a-zA-Z\d]'
double = r'\x80-\xff'                            # range used for CJK chars

# E-mail address: optional "mailto:", user, "@", dotted server, optional port.
# Contributes 8 capturing groups to the combined expression.
emailexp = r"""(mailto:)?
(alnum[-\w\.\+]{0,64})@
((alnum[-\w]{0,9}\.){1,4}(alnum\w{0,9}))(:(0|[1-9][\d]{0,5}))?"""
emailexp = emailexp.replace('alnum', alnum)

# Proxy specification: ip[:port]@protocol.
# The first octet must be non-zero; the remaining octets may be 0 (so that
# e.g. 10.0.0.1 is accepted) but may not carry leading zeros, mirroring the
# form used for the port.  The numeric range of each octet is not checked
# here; ParseURL validates the address with IsIP().
proxyexp = r"""(?P<proxyip>
(?:[1-9]\d{0,2}\.)
(?:(?:0|[1-9]\d{0,2})\.){2}
(?:0|[1-9]\d{0,2})
)
(?::
(?P<proxyport>0|[1-9][\d]{0,5})
)?
@(?P<proxyproto>http|socks[45]|telnet)"""

# General URL: [proto://][user[:passwd]@]domain[:port][path].
# ``cext`` is the extension of a last path segment containing a \x80-\xff
# character, ``eext`` the extension of a purely single-byte last segment.
urlexp1 = r"""
(?:
(?P<proto>[a-zA-Z]{3,8})
://
)?
(?:
(?P<user>alnum[-\w\.@]{0,29})
(?:
:(?P<passwd>.{1,20})
)?
@)?
(?P<domain>
(?:
(?P<company>alnum[-\w]{0,64}\.){1,5}
(?P<org>alnum\w{0,9})
)
|
(?:localhost) #none-dotted style of domain
)
(?:
:(?P<port>0|[1-9][\d]{0,5})
)?
(?P<path>
(?:(?:/pathch+\=http:/)|(?:/pathch+))*
(?P<tail>
(?:
(?:
/pathch*[double]pathch*\.(?P<cext>alnum{1,15}) #MAX_EXT=15
(?:[-\.\?\#\=&\+~!%:]epathch*)?
)
|
(?:
/epathch+
(?:
\.(?P<eext>[a-zA-Z\d\(\)]{1,15}) #MAX_EXT=15
)
(?:[-\.\?\#\=&\+~!%:]epathch*)?
)
|
(?:/epathch+)
|
(?:/)
)
)
)?"""

# Special URL styles.
calltoexp = r'callto://\w+'
# Caution: otherurl must not introduce additional capturing parentheses.
# ParseURL unpacks matchobj.groups() into a fixed list of 25 names, so any
# new group has to be added to that unpacking as well.
otherurl = calltoexp

# Combined expression: proxy | e-mail | general URL | other.  Each alternative
# is wrapped in its own capturing group so the caller can tell which matched.
urlexp = '(' + proxyexp + ')|(' + emailexp + ')|(' + urlexp1 + ')|(' + otherurl + ')'
urlexp = urlexp.replace('epathch', epathch)
urlexp = urlexp.replace('pathch', pathch)
urlexp = urlexp.replace('alnum', alnum)
urlexp = urlexp.replace('double', double)
urlexp = re.compile(urlexp, re.I | re.X)

# Characters that cannot be part of a URL (delimiters).  Not complete, see
# RFC 2396.
nonurlchar = r'[^-\w/\.@:\?\#$=&\+~!%\*\(\),\x80-\xff]'
# Matches from the search start through the last delimiter, so that .end()
# gives the index right after that delimiter.
urldelim = r'(.*' + nonurlchar + r')*'
urldelim = re.compile(urldelim)
nonurlchar = re.compile(nonurlchar)
# Top-level domain labels accepted as the last label of a host name.
# ParseURL lower-cases the last label and tests it with ``in topdomain``.
# Kept as a list, in the original order and with the original duplicate
# entries, so that indexing and iteration behave exactly as before.
topdomain = (
    # generic and frequently used
    'com edu gov int mil net org cn hk tw mo fr ru uk de au ca '
    # country codes
    'ad ae af ag ai al am an ao aq ar as at aw az ba bb bd '
    'be bf bg bh bi bj bm bn bo br bs bt bv bw by bz cc cf cg '
    'ch ci ck cl cm co cq cr cu cv cx cy cz dj dk dm do dz ec ee '
    'eg eh es et ev fi fj fk fm fo fr ga gb gd ge gf gh gi gl gm '
    'gn gp gr gt gu gw gy hm hn hr ht hu id ie il in io iq ir is '
    'it jm jo jp ke kg kh ki km kn kp kr kw ky kz la lb lc li lk '
    'lr ls lt lu lv ly ma mc md mg mh ml mm mn mp mq mr ms mt '
    'mv mw mx my mz na nc ne nf ng ni nl no np nr nt nu nz om pa '
    'pe pf pg ph pk pl pm pn pr pt pw py qa re ro rw sa sb sc sd '
    'se sg sh si sj sk sl sm sn so sr st su sy sz tc td tf tg th '
    'tj tk tm tn to tp tr tt tv tz ua ug us uy va vc ve vg vn vu '
    # remaining country codes and newer generic domains
    'wf ws ye yu za zm zr zw asia mobi name biz tv info'
).split()
# [\x80-\xff] matches CJK chars.
# The rule for CJK chars is: CJK chars can appear in the middle of the string,
# but if CJK chars are in the last part (one that is not just '/'), a ".aaa" suffix is required, where each a is a digit or a letter and the suffix length is 1~5.
# Deciding whether a string is an IP address:
# 1 definitely an IP address: n=count=4
# 2 undecided: n=count=3; if the part after it is ".*" then it is an IP, otherwise it is not even a URL
# 3 neither IP nor URL: all fields are numbers
# 0 URL, not IP: anything else
def IsIP(str1):
    """Classify a dot-separated string as IP address, partial IP or name.

    ``str1`` is split on '.' and every field is inspected.

    Returns:
        1 -- exactly four fields, each a decimal number in 0..255
             (certainly an IP address).
        2 -- exactly three fields, each a decimal number in 0..255
             (undecided; the caller checks what follows the string).
        3 -- every field is a decimal number, but the field count is neither
             3 nor 4 or some value is above 255 (neither IP nor URL).
        0 -- at least one field is not a decimal number (a host name).
    """
    fields = str1.split('.')
    for field in fields:
        # Require plain ASCII digits.  int() alone is too lenient for an
        # address field: it also accepts '+4', ' 4' and, on Python 3, '1_0'.
        # strip() leaves a non-empty rest whenever a non-digit is present.
        if not field or field.strip('0123456789'):
            return 0

    n = len(fields)
    in_range = [field for field in fields if int(field) < 256]
    if len(in_range) == n:
        if n == 4:
            return 1
        if n == 3:
            return 2
    return 3
def ParseURL(str1, x, linelen, ignoreblanks=(), bAutoHeader=True):
    """Find the link in ``str1`` that covers character index ``x``.

    The search is limited to the run of URL characters around ``x`` (bounded
    by ``nonurlchar`` matches) and uses the module-level ``urlexp`` pattern.

    Args:
        str1: Text to search.
        x: Index into ``str1``; only a match with start <= x < end is used.
        linelen: Row width; ``x // linelen`` is taken as the row of ``x``.
        ignoreblanks: Iterable of ``(row, nBlank)`` pairs.  For every pair
            with nBlank > 0, the nBlank characters that end at text offset
            ``(x // linelen + row + 1) * linelen`` are cut out of the matched
            string.  Presumably these are padding blanks at the end of
            wrapped screen rows -- confirm against the caller.
        bAutoHeader: When true, prepend 'mailto:' to bare e-mail addresses
            and 'http://', 'ftp://' or 'telnet://' to URLs without protocol,
            chosen from the host prefix (www/ftp/bbs) or the port.

    Returns:
        ``()`` when no acceptable link covers ``x``; otherwise the tuple
        ``(matchstring, offset0, offset1, type, bIP, ip, ext, bPicture)``:
        matchstring -- link text (possibly with a protocol prefix added),
        offset0/offset1 -- match start/end relative to ``x``,
        type -- 0 URL, 1 e-mail, 2 proxy,
        bIP -- 1 if the host is an IP address, else 0,
        ip -- that IP address or '',
        ext -- file extension found in the path or '',
        bPicture -- 1 if the link looks like a picture, else 0.
    """
    # Trim head and tail: clip the search window at the nearest delimiters.
    matchobj = nonurlchar.search(str1, x)
    endpos = len(str1)
    if matchobj:
        endpos = matchobj.start()
    matchobj = urldelim.search(str1, 0, x)
    startpos = 0
    if matchobj:
        startpos = matchobj.end()  # end() is the index right after the match

    i = startpos
    while True:
        try:
            matchobj = urlexp.search(str1, i, endpos)
        except RuntimeError:
            print('RuntimeError')
            return ()
        if matchobj:
            (start, end) = matchobj.span()
            # Unpack all 25 groups of urlexp; the order must follow the
            # pattern (proxy | e-mail | general URL | other).
            (proxy, proxyip, proxyport, proxyproto,
             email, emailheader, emailuser, emailserver, emailcompany,
             emailorg, non, emailport,
             nomalurl, proto, user, pw, domain, company, org, port,
             path, tail, cext, eext, other) = matchobj.groups()

            dellen = 0
            ext = ''
            if cext:
                ext = cext
            elif eext:
                ext = eext
            elif nomalurl and tail:
                # For multi-line URLs: when the second line starts with
                # http://, prevent a false match by cutting the match off in
                # front of a "Z...http" run that follows the extension.
                dotidx = tail.find('.')
                if dotidx != -1:
                    extstr = tail[dotidx:]
                    matchobj1 = re.search(r'Z{1,3}http', extstr)
                    if matchobj1:
                        Zidx = matchobj1.start()
                        ext = extstr[1:Zidx]
                        dellen = len(extstr) - Zidx
                        end = end - dellen

            if start <= x and x < end:
                matchstring = matchobj.group(0)
                if dellen > 0:
                    matchstring = matchstring[0:-dellen]

                # Trim the blanks described by ignoreblanks.  Floor division
                # keeps the row index an integer on Python 3 as well.
                y = x // linelen
                blankpos = []
                for row, nBlank in ignoreblanks:
                    if nBlank > 0:
                        blankEnd = (y + row + 1) * linelen - start
                        blankStart = blankEnd - nBlank
                        if blankStart == 0:
                            # Blanks at the very beginning: drop them and
                            # move the start of the match forward.
                            start += nBlank
                            matchstring = matchstring[nBlank:]
                        elif blankStart > 0:
                            blankpos.append((blankStart, blankEnd))
                pieces = []
                x0 = 0
                for blankStart, blankEnd in blankpos:
                    pieces.append(matchstring[x0:blankStart])
                    x0 = blankEnd
                pieces.append(matchstring[x0:])
                matchstring = ''.join(pieces)

                bIP = 0
                ip = ''
                if nomalurl and domain:
                    tmp = domain.count('.')
                    if tmp == 1:
                        if org.lower() not in topdomain:
                            # Something like "210.43." -- may be the first
                            # half of an IP address.
                            if len(str1) > end and str1[end] == '.':
                                tstr = matchstring + '.0.0'
                                if IsIP(tstr) == 1:
                                    bIP = 4
                                    ip = tstr
                            if not bIP:
                                return ()  # exclude most file names
                    elif tmp == 0:  # http://localhost/index.htm
                        if not proto:  # re, cr,
                            return ()
                elif emailorg:
                    if emailorg.lower() not in topdomain:
                        return ()

                bPicture = 0
                urltype = 0
                if bIP != 4:
                    if proxy:
                        bIP = IsIP(proxyip)
                        if bIP != 1:
                            return ()
                        urltype = 2
                        ip = proxyip
                    elif email:
                        bIP = IsIP(emailserver)
                        if bIP == 1:
                            ip = emailserver
                        elif bIP != 0:
                            return ()
                        urltype = 1
                        if bAutoHeader and not emailheader:
                            matchstring = 'mailto:' + matchstring
                    elif nomalurl:
                        bIP = IsIP(domain)
                        if bIP == 2:  # 202.118.1.*
                            idx = matchobj.span('domain')[1]  # domain's end index
                            # Slice instead of indexing: the domain may be
                            # the very end of str1.
                            if str1[idx:idx + 2] == '.*':
                                domain += '.0'
                                matchstring += '.0'
                                end += 2
                                bIP = 1
                            else:
                                return ()  # not a URL
                        elif bIP == 3:
                            return ()
                        if bIP:
                            ip = domain
                        else:
                            if matchstring.find('/') == -1:
                                if org.lower() not in topdomain:
                                    # Support alphabetic chars in front of an
                                    # IP, such as a10.8.0.2 or
                                    # kdfalkdfalkdf211.99.222.0
                                    lastch = matchstring[-1:]
                                    if lastch == '*' or re.match('[0-9]', lastch):
                                        digitobj = re.search('[0-9]', matchstring)
                                        if digitobj:
                                            numpos = digitobj.start()
                                            if IsIP(matchstring[numpos:]) == 1:
                                                bIP = 1
                                                matchstring = matchstring[numpos:]
                                                ip = matchstring
                                                (domain, company, org) = ('', '', '')
                                                start = start + numpos
                                    if not bIP:
                                        # exclude multi-level file names and
                                        # programming-language identifiers
                                        return ()

                if nomalurl and bAutoHeader and not proto:  # add protocol name
                    if not port or port not in ('80', '21', '22', '23', '8080'):
                        if domain:
                            if domain[0:3] == 'www':
                                matchstring = 'http://' + matchstring
                            elif domain[0:3] == 'ftp':
                                matchstring = 'ftp://' + matchstring
                            elif domain[0:3] == 'bbs':
                                matchstring = 'telnet://' + matchstring
                    elif port == '80' or port == '8080':
                        matchstring = 'http://' + matchstring
                    elif port == '21':
                        matchstring = 'ftp://' + matchstring
                    elif port == '23' or port == '22':
                        matchstring = 'telnet://' + matchstring

                if re.match(r"(?i)gif|jpg|jpeg|jpe|bmp|png|ani|tif|tiff|ico|jfif", ext):
                    bPicture = 1
                # The case-insensitive flag is passed as an argument: an
                # inline (?i) in the middle of a pattern is rejected by
                # newer versions of the re module.
                elif re.match(r'http://.*/(.*\.(gif|jpg|jpeg|jpe|bmp|png|ani|tif|tiff|jfif|ico).*|.*\.php\?bid=\d+&id=\d+&ap=\d+)', matchstring, re.I):
                    bPicture = 1

                offset0, offset1 = start - x, end - x
                if bIP == 4:
                    bIP = 1
                ret = (matchstring, offset0, offset1, urltype, bIP, ip, ext, bPicture)
                return ret
            # This match does not cover x; continue searching after it.
            i = end
        else:
            break
    return ()
pietty (
http://ntu.csie.org/~piaip/pietty/) has built-in link recognition, but I haven't read the code yet... partly because the putty+hack code is a mess...
The source is here:
http://ntu.csie.org/~piaip/pietty/archi ... 24_src.rar . It's under the MIT license, so I think it will be OK to use the code.