Upload
dothuan
View
231
Download
1
Embed Size (px)
Citation preview
3/26/10
1
CS544: NER with Weka
Zornitsa Kozareva, USC/ISI
Marina del Rey, [email protected]
www.isi.edu/~kozareva
March 26, 2010
Named Entity Recognition and Classification
• Identify mentions in text and classify them into a
predefined set of categories of interest:
– Person names: Prof. Jerry Hobbs, Jerry Hobbs – Organizations: Hobbs corporation, FbK – Locations: Ohio – Date and time expressions: February 2010 – E-mail: mkg@gmail.com – Web address: www.usc.edu – Names of drugs: paracetamol – Names of ships: Queen Marry – Bibliographic references:  – …
Prof. Jerry Hobbs taught CS544 during February 2010.
Jerry Hobbs killed his daughter in Ohio.
Hobbs corporation bought FbK.
<PER>Prof. Jerry Hobbs</PER> taught CS544 during <DATE>February 2010</DATE>.  <PER>Jerry Hobbs</PER> killed his daughter in <LOC>Ohio</LOC>. <ORG>Hobbs corporation</ORG> bought <ORG>FbK</ORG>.
1
3/26/10
2
NE System Overview
2
Learning Algorithm
3
TRAIN DATA
TEST
DATA
answer
Train Set Feature Generation
University,0,1,1,null,Uni,1,ORG
Trained Machine
Test Set Feature Generation
USC,0,1,1,1,0,null
3/26/10
3
Learning Algorithm
4
TEST
DATA
answer
Trained Machine
Test Set Feature Generation
USC,0,1,1,1,0,null
example class
PERSON
ORGANIZATION
PERSON
LOCATION
ORGANIZATION
LOCATION
OTHER
Train Set Feature Generation
University,0,1,1,null,Uni,1,ORG  TRAIN DATA
Given
Learning Algorithm
5
TRAIN DATA
TEST
DATA
answer
Train Set Feature Generation
University,0,1,1,null,Uni,1,ORG
Trained Machine
Test Set Feature Generation
USC,0,1,1,1,0,null
example Cap. inDicPer inDicOrg inDicLoc NP class
1 1 1 0 1 PERSON
1 0 1 0 0 ORGANIZATION
1 1 0 0 1 PERSON
1 0 0 1 1 LOCATION
1 0 1 0 0 ORGANIZATION
1 1 0 1 1 LOCATION
0 0 0 0 0 OTHER
3/26/10
4
Learning Algorithm
6
TRAIN DATA
TEST
DATA
answer
Train Set Feature Generation
University,0,1,1,null,Uni,1,ORG
Trained Machine
Test Set Feature Generation
USC,0,1,1,1,0,null
Choose a machine learning classifier from Weka
Learning Algorithm
7
answer
Trained Machine
Test Set Feature Generation
USC,0,1,1,1,0,null
example class
?
?
?
?
?
?
Train Set Feature Generation
University,0,1,1,null,Uni,1,ORG
Given
TRAIN DATA
TEST
DATA
3/26/10
5
example Cap. inDicPer inDicOrg inDicLoc NP
1 1 1 0 1
0 0 0 1 0
1 1 0 0 1
1 0 0 1 1
1 0 1 1 1
0 1 0 0 0
Learning Algorithm
8
answer
Trained Machine
Test Set Feature Generation
USC,0,1,1,1,0,null
Train Set Feature Generation
University,0,1,1,null,Uni,1,ORG  TRAIN DATA
TEST
DATA
example Cap. inDicPer inDicOrg inDicLoc NP
1 1 1 0 1
0 0 0 1 0
1 1 0 0 1
1 0 0 1 1
1 0 1 1 1
0 1 0 0 0
Learning Algorithm
9
answer
Trained Machine Train Set Feature Generation
University,0,1,1,null,Uni,1,ORG  TRAIN DATA
TEST
DATA
Test Set Feature Generation
USC,0,1,1,1,0,null
Predicted Answer
LOCATION
LOCATION
PERSON
ORGANIZATION
OTHER
OTHER
True Answer
LOCATION
OTHER
PERSON
ORGANIZATION
ORGANIZATION
OTHER
Evaluation
$$\text{Precision} = \frac{\#\ \text{correctly identified NEs}}{\#\ \text{identified NEs}}$$
3/26/10
6
NE Feature Generation
10
Features (1)
• Contextual  • current word W0
• words around W0 in [-3,…,+3] window
• Part-of-speech tag  (when available)
• Orthographic (binary and not mutually exclusive)
  initial-caps    all-caps    all-digits    roman-number    contains-dots    contains-hyphen    acronym    lonely-initial    punctuation-mark    single-char    functional-word*    URL
• Word-Type Patterns:
  functional    lowercased    quote    capitalized    punctuation mark    other
• Left Predictions • the tag predicted in the current classification for W-3, W-2, W-1
*functional-word is preposition, conjunction, article 11
3/26/10
7
Features (2)
• Bag-of-Words • words in [-5,…,+5] window
• Trigger words: • for person (Mr., Miss., Dr., PhD.) • for location (city, street) • for organization (Ltd., Co.)
• Gazetteers • names of cities, countries, villages, streets
• names of organizations
• person first name
• person surname
12 * put each type of trigger words and gazetteers in separate files, because you can treat them as separate features
Features (3)
• Length in words of the entity being classified
• Pattern of the entity with regard to the type of constituent words
• For each class • whole NE is in gazetteer • any component of the NE appears in gazetteer
• Suffixes (length 1 to 4)
• Previous word is an article
• Previous word is a noun
13
3/26/10
8
Collecting External Resources
14
Gazetteer Collection Method 1
• Yago contains over 2 million entities (like person,
organizations, cities among others)
• Download from:
 http://www.mpi-inf.mpg.de/yago-naga/yago/downloads.html
• Extract from the relevant relations all named entities
     Ex.
– X born in Y, where X is a person and Y is a location – X works for Y, where X is a person and Y is a person or organization
15
3/26/10
9
Gazetteer Collection Method 2
16
Person
Gazetteer Collection Method 2
17
• Step 1: Check if identified NE exists in Wikipedia
• Step 2: Extract the first 2-3 sentences
• Step 3: Pull the nouns matching the expression
          X is Y, Z
          X is Y and Z
• Step 4: Extract the information from the infobox
• Step 5: Verify in WordNet whether the found concepts
    are hyponyms of person, location, organization
3/26/10
10
Gazetteer Collection Method 3
• Use Stanford Named Entity Recognizer
http://nlp.stanford.edu/software/CRF-NER.shtml
    to identify the named entities in the current data sets.
• Use the predicted output as features
18
19
3/26/10
11
Capturing Simple Patterns
• Extract patterns in which the NEs occurred      Ex.
– Jenny/PER works/O for/O IBM/ORG ./O – Sam/PER works/O for/O Microsoft/ORG ./O
– Paul/PER Adams/PER worked/O for/O George/PER ./O
– Jenny/PER bought/O an/O orange/O ./O – Yahoo!/ORG bought/O Overture/ORG ./O
• Extract verbs to the left and to the right of the NE Ex.
– London/LOC is/O located/O in/O – John/PER drinks/O juice/O
20
WEKA
Waikato Environment for Knowledge Analysis
21
3/26/10
12
Weka: Data Mining Software
• Collection of machine learning algorithms
– open-source package written in Java
• Used for research, education and application
• Main features:
– data pre-processing tools – learning algorithms
– evaluation methods
– graphical interface – environment for comparing learning algorithms
22
Weka: Data Mining Software
• Classification algorithms:
–  decision trees, linear classifiers, SVM, Naive-bayes, kNN
• Prediction algorithms:
– regression (linear/SVM), perceptron
• Meta-algorithms:
– bagging, boosting (AdaBoost)
among others
3/26/10
13
Getting Started
• Install Weka software (on Linux):
– Download link:  • http://prdownloads.sourceforge.net/weka/weka-3-6-2.zip • Unzip the software
– Requirement:     Java 1.5 (or higher)
– Invoke Weka command:
• java -cp weka.jar <weka-command>
25
java -Xmx1000M -jar weka.jar    Weka GUI Chooser
3/26/10
14
@relation english_named_entity
@attribute position numeric @attribute pos_tag { NN, NP, VB, DT} @attribute word_length numeric @attribute in_gazetteer { no, yes} @attribute class { PER, LOC, ORG, MISC}
@data 3,DT,3,no,ORG 4,NP,10,yes,ORG 15,NP,6,yes,PER 7, NN,12,?,MISC ...
Data file format (.arff)
Other attribute types:
• String
• Date
Missing value
26
List of attributes (last: class variable)
Frequency and categories for the selected
attribute
Statistics about the values of the selected attribute
Classification
Filter selection
Manual attribute selection
Statistical attribute selection
Preprocessing
The Preprocessing Tab
27 Slide adapted from Marti Hearst
3/26/10
15
Choice of classifier
The attribute whose value is to be predicted from the values of the remaining ones.
Default is the last attribute.
Cross-validation: split the data into e.g. 10 folds and
10 times train on 9 folds and test on the remaining one
The Classification Tab
28 Slide adapted from Marti Hearst
Choosing a classifier
29 Slide adapted from Marti Hearst
3/26/10
17
all other numbers can be obtained from it
different/easy class
accuracy
32 Slide adapted from Marti Hearst
Running on Test Set
33 Slide adapted from Marti Hearst
3/26/10
18
WEKA
Command Line
34
Weka specifications
• Train classifier on training data and output model
• java -cp weka.jar <classifier-function> -t <train-file>  -d <trained-model>
• Run trained classifier model on test data • java -cp weka.jar <classifier-function> -T <test-file>  -l <trained-model>
• Specifying parameters:
-t : training file (.arff)
-T : test file (.arff)
-d : output filename (trained classifier model)
-l : input model (for testing)
-K : number of nearest neighbors for kNN algorithm
-h : help (check out other parameter options, etc.)
general
parameters
Classifier-
specific
parameters
3/26/10
19
Example: kNN in Weka
• Train a classifier using 2NN algorithm
• java -cp weka.jar                  weka.classifiers.lazy.IBk
            -t  data/weather.arff
            -K  2
            -d  model.2nn
• Run the trained classifier on test data • java -cp weka.jar                  weka.classifiers.lazy.IBk
            -T  data/weather.arff
            -l  model.2nn
Classifier-function in weka
Training file    Algorithm parameter    Output model name
Classifier-function in weka
Test file    Input model name
Sample Weka output
3/26/10
20
• Classification labels for each instance (use "-p 1" option)
• java -cp weka.jar   weka.classifiers.lazy.IBk  -T  data/weather.arff   -l  model.2nn  -p 1
More detailed output
• kNN:  • Decision trees: • Naïve Bayes: • AdaBoost:
Weka classification functions