© . . © . . ©. .
-
boyarin9@yandex.ru natfed@list.ru graziokisa@gmail.com
© . !.
-
kanev@emi.nw.ru Assoul@yandex.ru
" # $% &
$ # ' # # (
#% '")# ()# #.
( *$ #'" +) $'"+)
#& , +), $ - # ,
$ (# '-#) $ '0()
$'''") )) &#)
#& +), ', 1 ,
$& '"), $-. (
$ ' # ) TextTiling, $'"+02 ''"0 &#% 0 + #1- - # (# . '- ' ( #%
, $#, +# '"+2 , ' ( $) #1- # ,
$) ". $-') $ #'")
# % $# #% -' +' (), +). $ # +)
$-1-, ( $-'0( 3 , ' ( , ($-', ' & , +) 2 $)3 (
#% .
1
6&& " '-'" )
## $-$-' $' , '), &#% ), ( .
'"30 -'0 , ( '0 ), 2 1 ( $+), #& , ( , () "
. -. 6, $ ', '"3 , &#% -)2) -#) +) ', '-02 , -' ( [16, 26, 30]. *# $) <# $ # ' (1 – 2 #& ), ( $+' $ #" -' , ' + ( #-), ' +#2 ,', ) . $.
# # - $
$-' 0 -$ $ ()# ()#
# (+ , - *&& "
&#% $ # , -# ). ' #, ( , -' -
$- # - % )# -'
$ )# # – $ #, ##
' '0()# '# . +'"
$ ) +$ $'"+'" $'(
' -# %' #, # )1- (0 2'" ' ) $ , ' )) + %)
# #'" ( (# '0(), ', $ )# '1 " %'"
$-' # , 1-# -#.
# +#, ,- # +%
&#% $ ( , (, 0 (-", 3 +-(
# + -' -#
$-#) (# , %' $'"+'. 6 +-(
XVII
DAMDID/RCDL’2015 «
!" », #, 13-16 2015
'" $ $ (# ( , $ ( # ), +),, $ )# $1 $'#
'- . $.
2# # ' $ -( '"3 ( ' ' #
# ( #% ( , + $ - 1). -, $ ', -# 0 -(0 *&& "
+-), ,, $-'02 , % 0 -'"), $-'1 ' , ), &# + +), ' -$' % (#., $ #, [3, 5, 9], ' '"3 , ), $, [4, 7, 21]). 1
# +'") $ # * , ' # '")# ()# # $-') - ( $ ( - - (.,
$ #, [5, 9, 11]).
" # $% &
$ # ' # # (
#% '")# ()# #.
( *$ #'" +) $'"+)
#& , +), $ - # ,
$ (# '-#) $ '0()
$'''") )) $ $-'1 #
&#) #& +), ', 1 , $& '"), $-.
'- + #" ( #%
, $#, +#
'"+2 , ' ( $) #1-
# , $) ", +) , ' ( , - % -' ' +, 1
$-'0( 3 , ' ( , . #% 1 -' )-' +# (', $&)
&# -' -' $- ' - -'" ) #.
2 $
( ' #) # (
#% + 0 +
$'- ,. $+ '- , [27, 31], (), , $'-0 $)
+ , ' (
('0), # ( +# '")) ' - ( ($'''")):
x '0) $ + – () $
# #1), $-'1 , ; x # ( $ + – $'"+
# , ' + , $ #)', -#
;
x +# '") $ + – $'"+
# # ' # +'")#
# # # -' +#2 #)'
$-302 $-'1 ;
x $'''") $ + – ' (
$-'1 , )02 , + , ( # $ +# ), '0 $''' +# ), 1
- &# )1 +#),.
'"3 202 , ' # )' + $ (1). !' #) * $ #1 +-' "
- $$). !' #) $ $$) $'"+0 ''"0 &#% 0 + #1- - # (# . - + ' +), ' # *
$$) – TextTiling [11, 12] – + '-02 , 3:
x )$' '## +% -'
$-'; # +#, $+
$'-'"" + N;
x $'-'") ) $$ 0
$-$-'1 -' W 1-;
+ , &# ' +## k
$-$-'1 , ) -'"3#
$'"+ '"+2 3# s
$-$-'1 . +# ' # s=1, .. 0 (' $$ 0- - (W*k)- c $$ W- - (W*(k +1))-, +# $$ W- - (W*(k + 1))- $$ (W*2)- - (W*(k+2))- . -. - $$- $'-
$-$-'1 0 %;
x % ' ( ,1" - , ' ' M #1- (W*k)-
#)# # :
x , (1)
x - wn,i– n- i-# '.
x ''") # ##) &% (1)
# 0 %) #1-
## ( ' # - ' 13 $-'1 ' +%).
!' #) $$) % 0
$-' $02 , $ # . , ' # DotPlotting [20] $'"+
-' ' + ' ( + -#)
& , # $ # X Y'-)
$'1 ; ' $+ % , x y ,- - () ),
& ( $+ % , (x, y) (y,x).
*# +) #) +'"
0 -#, $'1)# -'"
- ' , ) $'"0 (.
'( $-' '-
*## $ # # - + -, : ' # # +% $' ( %,, ' # # +% $' ( #. C+ # * - '
' # 99 [4], # ' ()# +#
+' + # ' ( ,1
#1- # - , &#, +#
#-# - # ( $##
,- +) # #'" $'"0 *
#).
$$\cos\varphi = \frac{\sum_{n} w_{n,i}\, w_{n,i+1}}{\sqrt{\sum_{n} w_{n,i}^{2}\, \sum_{n} w_{n,i+1}^{2}}}, \qquad 0 \le \cos\varphi \le 1$$
' % 1. D ), ( +(
(
' & ( $ ( E+) (
C+# &#
($(. +) 1. Romme L'Art de la Marine, ou Principes et Préceptes Généraux de
l'Art de Construire, d'Armer, de Manœuvrer et de Conduire des Vaisseaux, par M. Romme. La Rochelle, 1787. Chapitres VII, VIII.
&%+ 110261
2. C## , ' ') (' $ ', (02 , 1 , $' 1- ', ( ) $- # C###,
$-# 1 !-#
$&# % # ( ' 2. I" 1, 2.
' &%+ &' $ !'- J 3. -, 1793. ') 7, 8.
161152
3. U-boat Williamson G., Johnson L.: U-Boat crew 1914-45/
ed. Osprey Publishing, Great Britain, 1995.
' 60563
4. -- --) &' # . 1914–1945 / . '"#;
. '. .!. '"%. – .: «+-'"
!», 2003.
60560
5. ) ( – 23800
-'1 %') - #- & % +), ' # (#., $ #, [5, 6, 8, 13, 17]),
$+'02 ,, $# # '0 + , ' # (" - $) + ,
$ 02 . $ #, -' ( + $ (2) $-' $-'0(" 3 ' ($ #, WordNet) ' "
0 # *&& % , 102 ( (# ' 3 ''% -# ($ #, ) [5]. ' ( + $ (4) $-' -# $'"+" $ # + ##
&% ' DotPlotting #) ' ( ,- # #1- ##
[25].
!' + #) ), #1 )-' " - $'#, $$02 , , - 0 '"0 $ $ ), #. -$),, '"3 #-,
$-'), ' , + -' ' +). +)) $$)
$ # '" +-( # (
#% +() ' [3, 24]. -),, -' # , +), '0( , -#
-$ 0 -( $') ' # - ) ' ( ). , -" ,, *&& " ,
$-'1), ' # ( + + + $#
+ )# .
$'- # $'( ' + ' #) , ( #%
(#., $ #, [7, 14, 21]). + '"3 + , '1 #-'" ' ( , + ' - ## $-' '
$ ##, $ (# # #"
'"3 ' ( #, $' 1- ' +) - + #.
# (0 -' $-,-
$-' ' +#2 ,' (LDA) [2], 3 $'"+# #3 # ( . $ #, [7] LDA ' + , ( ' #,
$+'02 ) " - '
#% . !' # TopicTiling [21],
' ( TextTiling, )( '
#) ,- #1- - #
## , - $'"+0 () (# ', () - & #, $- '"
( ), -' 1- '
$#2"0 LDA.
!' #) * $ $+)0 # '(30 *&& ", (# '"3 (02
$ (# ' 1 (
$-' ' + ## . -'
" (02 $ ' + #) -'1) $ -'1" -# -# [21], ( - 1 -" -' '" ( ,
%" ## +) '""0.
2 " $- )
*$ #'" % $#
#% , 1), ( + +%
# ( #% '"), (),
. C# +) –
' #% # ,-
#1- ## .
C . 1. ! +# () # ( 1) %) # (H)
$ -, $, $- -' «Romme»
3 # " %
3.1 # ! ) +'' " + #& , (
# , +), ( , ' ,
&%+ ) $ - # . D ( $-') '. 1. '-#)
$ 3' $'''") &#) #&
+) ' ($+. 1, 3), 1 ,
$& '"), $- ($+. 2, 4).
$-,- # $'# - ( -# -' +), $+' ( # - '-" +)) $#
#% . I( +#
' # &%+# +), )$''"
$#2"0 OpenXerox [18],
# +) – $#2"0 (
' + SemSin [29]. $ $-' -'
1- )' &# (0 ( ' +. ( $ )' &# , 2 + 20 ),
&#, +), '()# +# + ($+. 5 '. 1). $-
#% 2''" $- +' ()# #-# ('. 2).
' % 2. ) $- +( $
N
'## +% + -' $-' N+ '## +% + (( +# +
2 '"),
N+++ '## +% + (( +# + 2 '"),, $ ''"), ''
N++' '## +% + (( +# +
$-'0( 3 ' &
3.2 !"
( + ' # )'
) TextTiling, ' $+(
' + #, +# + 0 #.
'-'"" 3 ' # +(
$# $- $ ) )3 (#. $. 2).
,- *$ # " ' " '-02
$#) ' #: +# ' W*k[];
+# $) #1- '# s*k [].
# , - *$ # $'"+) ' $# -' ), 02 -' +%.
3.3
' % ( #%
$-'1 - # , # ( ' 3 recall-precision [12], - [19],
#) PP Pk [1], # WindowDiff. 1- + , - -.
$ #, # WindowDiff
$'1 &), % #,
&# ), $ + ' , %, $'(), % #)#
' ##, $-', '"+2 . R#
( ' , -12 , 3 ( $-') %), # 2 ( ' . -
$'-02 '- [10, 15, 22] ) ' - - # WindowDiff. -
% '1 $-') (false positive, FP)
$$2) (false negative, FN) %), ) -'1) #" +0 +( #" + # +-( #% . # ,
# WindowDiff $+' (" ' ( 3 $-' %), 1 $ - +)3 +( 3 # (' % . + * # $-'1 - #- & %
# WindowDiff (#., $ #, [22]).
3# '- -' % (
#% $'"+'" ' F-# [28]:
0 0,2 0,4 0,6 0,8
cos M 1
A
$$F = \frac{2PR}{P + R}, \quad (2)$$
-
$P = \frac{TP}{H}$ – (", $R = \frac{TP}{TP + FN}$
–$', FP – ' ( '1), +( , FN – ' ( $$2), +( , H – ( ' -), % #.
( *' -' #) +'
*$0 +# % #) # (A), #.
. 1. #), &#, , % 1 # - - .
- $# #% $- ' '-02 # +#. !' + '" +(
#) (1) + # " #
$ z. R( cos M, #"3 '
) z, # ' " # #), . .
% # (H). R# $- '"
$' $'(), $'-'"
. $ '" $- (TP) # ' "
'( , - , $'-'", (A H) )' #() ''") # ##) («$')») -' - 1 +% ' -' - , +%. ' $-' )#, $'", $+' -'") ' +, ( $# +% #)
$) $' # " #, , 2 ,
$,- $ ,- '" '-02# +%.
'") «$')» H #(' " '1)
(FP), «$')» A – $$2) (FN). '
$-' '(3 +( F-#) "
$ z" ' 0 - 1.
# #, ( #) )3 #
% 0 ( #% '"
$&# $( 0 +#1"
($ # +% ) $# ' #
#% , # -' +), +-(
+ 3 ( $'), ( 1 -' (), . 6
$'1 ' 2 - # ## $'"+ ) ( # F-#) (2), -
# " $ # +% ,
# ## – '(3 3 P R.
' #- + : , $ #, ' " ) *&& % ) #1- P R.
4 & ! "!
4.1 "
( $ #'"), $#
#% +# ' #
TextTiling [11] #- '"+2
(') +## (W*k) + k
$-$-'1 -' W 1-, $
$) ' s=(W*k)/2. - 3
*$ #) $'"+ # +),
# % s, W k$+' , ( '0) +(
$) , ' () ', 10
" & , $'), - % 02 %) #)'), &#, $ ( (+0 (,) $ # $ - . 2).
# #, ( * +'" $- )-#, -')# *$ #'"# '- [5].
*# 3 , -'"3 , *$ #,
$) ' $'"+'".
C . 2. R( #) -' «U-boat» c
$- $ N+: –W*k=10, s=0;
–(W*k)=20, s=(W*k)/2;
4.2 "
#('" )3, +# ' # TextTiling [11, 12] ( - %) ' + $'"+) & -' ),
$'1 $'(02 , % ' - ' 13 $-'1 ' +%. 3
*$ #) $+' , ( $ # '"
'")# ()# # ( ' + *#
'( -' '" ('. 3). 6 +
#, ( $ # (10 2 '"),) '() +# ' $ -
$' 0 '"3 ( ' «'1), »,
#"3 (", $ '"3# (40 2.) ' " , ( $ '"0 %, ( $ - $- 0
$'). # +'" -' ( «U- boat» $- $ N+ $ - ' % 3, ) 1–3.
' % 3. ' +# '
S 1 2 3 4
C+# ' [] 10 25 40 +%
F-# 0.06 0.04 0.03 0.17
#) 1 '-'
$-$'1 , ( #)') %) '- " %, +%. 1 +# ", ( ' $-') $ ( ) #
$ *# $-. $ #, [32] :
«...#1 +", ( +% (# –
#'"), & ( )-') *'#
, -12 - +0 #)'" '
&#». - ), ' [23], -' $ &#'")# '02 # (+%# ' +-'#) $+' )-' "
$-#) -#: %) +%
# ( , ' # $-", -' +%) + $ +( -#
($ #, ,-1)), '"3 +%) # -1" '"
$-#.
' *$ # #) $'"+'
$# -' ), $-02 +%#.
' 3# +%) $ - ' "
'-02#, -( '"3
"0 +%) $-'0 +' ' $' , ), 2 ', $-02 , '#
102 .
C+'") ' + -# 0 + ' ( +( F *# '( $ 0 & -' ' ('. 3, 4).
*# -'"3# -' , $'"+'" (' $ +%#.
4.3
$-'02 # $## -' $ 3 $- %) # -#
# ' " $ z. #
#"3 (" P, . . $'0 $$
%, $' R, . . '"3 ( ' % 1 . # #, 2
$ #'") " $ ( + F-#) ( . 2). $+' 3 *$ #),
$ #'" +( z '" + $
$- . ' ,
$- , # N++' ('. 2),
# #'") +( F-#) - 0 $ z=0.1…0.15. $-'0( # -$' '"), (N++'), '
$- $ +-' 4.5, # #'"
+( F-#) - $ ܟ'"3 ,
+( , z( . 3).
C . 3. R #" P, R F-#) $ z
4.4 "
# )' '-) +#1
$ # +% $- + # +) . ( -, ( $ $-
& ( $-' $ ' -' ' + +), +),, #1 $-')# +) $#
+ . ( , N+ N+ ( '0( # ' (), ') & 0 '0) $ + . N+++ ( ' -) $, ( $-'
# 0 '" '1) ( ( ), - '0)#, ( ( &
$'''") $ + .
' + #), ,, $# # -),
$-', '0(' " ' () -1'") ': – , ,, U-boat,jacket,war– ' ,, voile ($), poulie ('), mat (#(), fig (& , )' ), vergue() – &%+ ,.
R( F-#) -' $ #'" $
, $'() -' +' (),
$- «$ '#» (.. -' ,
$- , # N++'), $ -) '. 4. #'") +( )-')
$'1 )# 3 &#. # #, ( $'() +( $' 0 +'"# , , [3, 5].
' % 4.
N+ N+
-- 0.19 0.21 0.11
U-boat 0.24 0.17 0.19
Romme 0.46 0.44 0.53
C## 0.21 0.22 0.18
) 0.60
!' + ' %) $+), ( (), , -' +'#
$-+'# +( #) '
$', '' $ # (# ' +.
+) « +)#» ( + $'"+# ' . 1 # $ '" '"3 , ), -1 + $ # ,-' -$' '"),
# ( -' +( '" )3.
6 +'") ' $ $ "
$#2"0 - ## #) ,-
#1- +%# $ # ( «Romme»
( . 1).
$'3)# '")# ' # $+
(' $-+-'). -, ( -' $-', , $ 1 # ,-, - «$')», $ #) +) #1- ' + # $ #)' &## , 0 $1 , #,. *# ( -' , $-'# $ F-#, # +( '": 0.46 $ # 2 '")# $ 0
0,2 0,4 0,6 0,8 1
0,05 0,1 0,15 0,2 0,25 0,3 0,35 0,4 0,45 0,5 0,55 0,6 0,65 0,7 0,75 0,8 0,85 0,9 0,95z
P R
( $ 0 z = 0.15 $ 0.53 -')# $-'# $ ( $ 0 z = 0.1.
# +#, 1, ( # $-$( '" $'"+ '"
2 '"),, $ (# + '0(
(), ($ $ N+). ' # 2 $)3 ( #%
- ( $ ''"), ''.
&%+# '"3 *&& - -' $-'.
4.5 '
+ ' ) +, ( $-'0(
3 , ' ( , 2
$)3 ( " (") #% ,
$ (# * %'"0 #1 $'"+"
'" '" # [5], # ( ' & ' [21]. ' +) '" # ,- # ( )# -$ , $*# 3 ,
*$ #, #) $'"+' # ( ' & [33], -12 1680 ', $ )# $-') ' 190 ). '# [29].
') * ' & - $ 0 $ ## WordNet.
$-$''", ( ',
$ -'12 -# ', $-0,
$ #, ' , ,,…
«C##» ' , ,
«-- ».
C# # ,) $ #. +"## -
$-'1 + #1), +%
«-- »:
, ! "
# - , - ! .
6 $-'1 , # ( -0 2" # , 2 #0 - ), ', $*# $ «$ '#» # ,- '0. - ' & # ' , , ' -1-), ' , , , ! – ' . , ( ' -' 2 '"), $ )( ' ,- «$ '#» $'(# cos M = 0.71.
. 4 $+ + #" F-#)
$ -' -, $ #% –
«$ '#» «$ '#» (.. $- $ N++') $ # «C##».
C . 4. ' + «$ '#» «$ '#»
-, F-# $ $,- «$
'#» $)3 ' (# - +.
" 2 $ '# #
" " -' - 1 '(3 +'", $ #, ') ) #'
# " ( -'" ',
#' <- " - # 2 ' -1-). # #'"), +( F-#)
$-' ' % 5.
' % 5. F-# $ ' + «$ '#» «$
'#»
«---
» «C##» ) ($)
'# 0.21 0.22 0.60
'# 0.50 0.7 0.78
-, ( , , «$
'#» - +( '" '(3 +'").
, ( -' ), 1 $
$# z=0.5 $) -0 - ( # , . . $+ % FN FP
$-0.
# +#, ( -, ( -' - 1
$ #'#), +'" $ # (
#% (), 1 % "
,- +' ( &# $ ' , $ -'")#
'#, ( $+' +( '" *&&
$'"+" $) +, $ 02 .
5 ( )
- '- $% & $ # ' # # ( #% '")#
()# # , +), $ -
# , $ (# $ '0() $'''")
&#) #& +), ', 1 , $& '"), $-. (
$ ' # ) TextTiling, $'"+02 ''"0 &#% 0 +
#1- - # (# . '- ' ( #% ,
$#, +# '"+2 , ' (
$) #1- # , $) ".
$-') $ #'") # % $#
#% -' +' (), +). $ # +) $-1-, ( $-'0(
3 , ' ( , 2
$)3 ( #% . 0
0,2 0,4 0,6
0,05 0,1 0,15 0,2 0,25 0,3 0,35 0,4 0,45 0,5 0,55 0,6 0,65 0,7 0,75 0,8 0,85 0,9 0,95
F
z
*
[1] Beeferman, Douglas, Adam Berger, and John Lafferty. Statistical models of text segmentation.
Machine Learning, 34(1-3), February 1999.
[2] David M. Blei, Andrew Y. Ng, and Michael I.
Jordan. 2003. Latent Dirichlet allocation. Journal of Machine Learning Research, 3, p. 993–1022.
[3] Anja Habacha Chaibi, Marwa Naili, Samia Sammoud. Topic segmentation for textual document written in Arabic language. 18th International Conference on Knowledge-Based and Intelligent Information & Engineering Systems - KES2014. Procedia Computer Science 35 (2014), p. 437–446
[4] Choi, F.Y.Y. 2000. Advances in domain independent linear text segmentation. In Proceedings of the 1st Meeting of the North American Chapter of the Association for Computational Linguistics, p. 26–33.
[5] G. Dias, E. Alves, J.G.P.Lopes. Topic Segmentation Algorithms for Text Summarization and Passage Retrieval: An Exhaustive Evaluation. AAAI'07 Proceedings of the 22nd national conference on Artificial intelligence - Volume 2. p. 1334–1339
[6] Lan Du, Wray Buntine, and Mark Johnson. 2013.
Topic Segmentation with a Structured Topic Model. In Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, p. 190–200, Atlanta, Georgia.
[7] Jacob Eisenstein. Hierarchical Text Segmentation from Multi-Scale Lexical Cohesion. NAACL '09 Proceedings of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics, p. 353–361.
[8] Jacob Eisenstein and Regina Barzilay. 2008.
Bayesian Unsupervised Topic Segmentation. In Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing, p. 334–343, Honolulu, Hawaii.
[9] Dominik Flejter, Karol Wieloch, Witold
Abramowicz. Unsupervised Methods of Topical Text Segmentation for Polish. Balto-Slavonic Natural Language Processing 2007, June 29, 2007, p. 51–58. Prague, June 2007. Association for Computational Linguistics.
[10] M Georgescul, A Clark, and S Armstrong. An analysis of quantitative aspects in the evaluation of thematic segmentation algorithms.
SigDIAL’06 Proceedings of the 7th SIGdial Workshop on Discourse and Dialogue, Jan.
2009.
[11] Marti A. Hearst. TextTiling: Segmenting text into multi-paragraph subtopic passages.
Computational Linguistics, 23(1), p. 33–64, March 1997.
[12] Marti A. Hearst and Christian Plaunt. 1993.
Subtopic structuring for full-length document
access. In SIGIR ’93: Proceedings of the 16th annual international ACM SIGIR conference on Research and development in information retrieval, p. 59–68, New York, NY, USA. ACM Press.
[13] Anna Kazantseva and Stan Szpakowicz. 2011.
Linear Text Segmentation Using Affinity Propagation. In Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing, p. 284–293, Edinburgh, Scotland.
[14] Anna Kazantseva, Stan Szpakowicz. Hierarchical Topical Segmentation with Affinity Propagation.
In Proceedings of COLING 2014, the 25th International Conference on Computational Linguistics: Technical Papers, p. 37–47, Dublin, Ireland.
[15] S. Lamprier, T. Amghar, and B. Levrat. 2008.
On evaluation methodologies for text segmentation algorithms. 19th IEEE
International Conference on Tools with Artificial Intelligence - Vol.2, Jan.
[16] David YW Lee. Genres, Registers, Text Types, Domains, and Styles: Clarifying the Concepts and Navigating a Path through the BNC Jungle.
Language Learning & Technology. Vol. 5, No. 3, p. 37-72, September 2001.
[17] Hemant Misra, François Yvon, Olivier Cappé, and Joemon M. Jose. 2011. Text segmentation: A topic modeling perspective. Information
Processing and Management, 47(4), p.528–544.
[18] POS Tagging Open Xerox.
https://open.xerox.com/Services/fst-nlp-tools/Consume/Part%20of%20Speech%20Tagging%20%28Standard%29-178/ 2 – 14.03.15
[19] Ponte, Jay and Bruce Croft. 1997. Text segmentation by topic. In Proceedings of the First European Conference on Research and Advanced Technology for Digital Libraries.
[20] Jeffrey C. Reynar. 1994. An automatic method of finding topic boundaries. In Proceedings of the 32nd annual meeting on Association for Computational Linguistics, p. 331–333, Morristown, NJ, USA. Association for Computational Linguistics.
[21] Martin Riedl, Chris Biemann. Text Segmentation with Topic Models. JLCL 2012 – Band 27 (1), p. 47–69.
[22] M. Scaiano, D. Inkpen. Getting More from Segmentation Evaluation. 2012 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, p. 362–366, Montréal, Canada, June 3-8, 2012.
[23] Stark, Heather A., 1988. What do paragraph markings do? Discourse Processes 11, p. 275–303.
[24] Xiaojun Wan. On the effectiveness of subwords for lexical cohesion based story segmentation of Chinese broadcast news. Information Sciences 177 (2007), p. 3718–3730.
[25] Na Ye, Jingbo Zhu, Huizhen Wang, Matthew Y.
Ma, Bin Zhang. An Improved Model of Dotplotting for Text Segmentation. Journal of Chinese Language and Computing 17 (1), p. 27–40.
[26]! .., !# .., ..
# $--1 $ $), 3 -' -# (- , ( &#% // - , . 2012. 1 (77). . 128-134
[27]!3 !.., !# .., .., ..-' #-' +-$% % '" ),
$-,- $'"+ # # + %$' // SWorld. 2013.
. 8. 1. . 81-93
[28].. '"3 -. !# ( # +)
#$"0 ' . .: 6. 2011.
[29].!. , .. . # - ( ' + SemSin //
1-- &%
$ #$"0 ' « '- 2012», , 30 # – 3 0 2012 . http://www.dialog-21.ru/digest/2012/?type=doc
[30] 0. ' ( ( . .: !-# . 2010.
[31]#'" .!. $) + (#
(( # ' ( "
' # +) M. Black “Metaphor”) //
# . C +1 & '' . – 2013. )$. 4(24). – . 140–150.
[32].. & #. C +) '"
( : '% .– .: ' , , 2004.
[33].!. +. #$"0 # +). : +-- .-. - , 2004
Specifics of Applying Topic Segmentation Algorithms to Scientific Texts
K. Boyarsky, N. Gusarova, N. Dobrenko, E. Kanevsky, N. Avdeeva
This paper considers how to apply topic segmentation algorithms to real scientific texts. To study it we used monographs on the same subject written in three languages. The corpus includes several fragments both in the original and in professional translation.
The research is based on the TextTiling algorithm, which analyses how tightly adjoining parts of a text cohere. We examined how several parameters (the cutoff rate, the size of the moving window, and the shift from one block to the next) influence segmentation quality. The optimal combinations of these parameters are determined for several languages. The experiments on Russian-language texts show that external lexical resources (stop-lists, classifiers, ontologies) notably improve the quality of segmentation.