• Aucun résultat trouvé

Исследование специфики применения алгоритмов тематической сегментации для научных текстов (Specifics of Applying Topic Segmentation Algorithms to Scientific Texts)

N/A
N/A
Protected

Academic year: 2022

Partager "Исследование специфики применения алгоритмов тематической сегментации для научных текстов (Specifics of Applying Topic Segmentation Algorithms to Scientific Texts)"

Copied!
9
0
0

Texte intégral

(1)

© . . © . . ©. .

-

boyarin9@yandex.ru natfed@list.ru graziokisa@gmail.com

© . !.

-

kanev@emi.nw.ru Assoul@yandex.ru

" # $% &

$ # ' # # (

#% '")# ()# #.

( *$ #'" +) $'"+)

#& , +), $ - # ,

$ (# '-#) $ '0()

$'''") )) &#)

#& +), ', 1 ,

$& '"), $-. (

$ ' # ) TextTiling, $'"+02 ''"0 &#% 0 + #1- - # (# . '- ' ( #%

, $#, +# '"+2 , ' ( $) #1- # ,

$) ". $-') $ #'")

# % $# #% -' +' (), +). $ # +)

$-1-, ( $-'0( 3 , ' ( , ($-', ' & , +) 2 $)3 (

#% .

1

6&& " '-'" )

## $-$-' $' , '), &#% ), ( .

'"30 -'0 , ( '0 ), 2 1 ( $+), #& , ( , () "

. -. 6, $ ', '"3 , &#% -)2) -#) +) ', '-02 , -' ( [16, 26, 30]. *# $) <# $ # ' (1 – 2 #& ), ( $+' $ #" -' , ' + ( #-), ' +#2 ,', ) . $.

# # - $

$-' 0 -$ $ ()# ()#

# (+ , - *&& "

&#% $ # , -# ). ' #, ( , -' -

$- # - % )# -'

$ )# # – $ #, ##

' '0()# '# . +'"

$ ) +$ $'"+'" $'(

' -# %' #, # )1- (0 2'" ' ) $ , ' )) + %)

# #'" ( (# '0(), ', $ )# '1 " %'"

$-' # , 1-# -#.

# +#, ,- # +%

&#% $ ( , (, 0 (-", 3 +-(

# + -' -#

$-#) (# , %' $'"+'. 6 +-(

XVII

DAMDID/RCDL’2015 «

!" », #, 13-16 2015

(2)

'" $ $ (# ( , $ ( # ), +),, $ )# $1 $'#

'- . $.

2# # ' $ -( '"3 ( ' ' #

# ( #% ( , + $ - 1). -, $ ', -# 0 -(0 *&& "

+-), ,, $-'02 , % 0 -'"), $-'1 ' , ), &# + +), ' -$' % (#., $ #, [3, 5, 9], ' '"3 , ), $, [4, 7, 21]). 1

# +'") $ # * , ' # '")# ()# # $-') - ( $ ( - - (.,

$ #, [5, 9, 11]).

" # $% &

$ # ' # # (

#% '")# ()# #.

( *$ #'" +) $'"+)

#& , +), $ - # ,

$ (# '-#) $ '0()

$'''") )) $ $-'1 #

&#) #& +), ', 1 , $& '"), $-.

'- + #" ( #%

, $#, +#

'"+2 , ' ( $) #1-

# , $) ", +) , ' ( , - % -' ' +, 1

$-'0( 3 , ' ( , . #% 1 -' )-' +# (', $&)

&# -' -' $- ' - -'" ) #.

2 $

( ' #) # (

#% + 0 +

$'- ,. $+ '- , [27, 31], (), , $'-0 $)

+ , ' (

('0), # ( +# '")) ' - ( ($'''")):

x '0) $ + – () $

# #1), $-'1 , ; x # ( $ + – $'"+

# , ' + , $ #)', -#

;

x +# '") $ + – $'"+

# # ' # +'")#

# # # -' +#2 #)'

$-302 $-'1 ;

x $'''") $ + – ' (

$-'1 , )02 , + , ( # $ +# ), '0 $''' +# ), 1

- &# )1 +#),.

'"3 202 , ' # )' + $ (1). !' #) * $ #1 +-' "

- $$). !' #) $ $$) $'"+0 ''"0 &#% 0 + #1- - # (# . - + ' +), ' # *

$$) – TextTiling [11, 12] – + '-02 , 3:

x )$' '## +% -'

$-'; # +#, $+

$'-'"" + N;

x $'-'") ) $$ 0

$-$-'1 -' W 1-;

+ , &# ' +## k

$-$-'1 , ) -'"3#

$'"+ '"+2 3# s

$-$-'1 . +# ' # s=1, .. 0 (' $$ 0- - (W*k)- c $$ W- - (W*(k +1))-, +# $$ W- - (W*(k + 1))- $$ (W*2)- - (W*(k+2))- . -. - $$- $'-

$-$-'1 0 %;

x % ' ( ,1" - , ' ' M #1- (W*k)-

#)# # :

x , (1)

x - wn,i– n- i-# '.

x ''") # ##) &% (1)

# 0 %) #1-

## ( ' # - ' 13 $-'1 ' +%).

!' #) $$) % 0

$-' $02 , $ # . , ' # DotPlotting [20] $'"+

-' ' + ' ( + -#)

& , # $ # X Y'-)

$'1 ; ' $+ % , x y ,- - () ),

& ( $+ % , (x, y) (y,x).

*# +) #) +'"

0 -#, $'1)# -'"

- ' , ) $'"0 (.

'( $-' '-

*## $ # # - + -, : ' # # +% $' ( %,, ' # # +% $' ( #. C+ # * - '

' # 99 [4], # ' ()# +#

+' + # ' ( ,1

#1- # - , &#, +#

#-# - # ( $##

,- +) # #'" $'"0 *

#).

$$\cos\varphi = \frac{\sum_{n} w_{n,i}\, w_{n,i+1}}{\sqrt{\sum_{n} w_{n,i}^{2}\;\sum_{n} w_{n,i+1}^{2}}}, \qquad 0 \le \cos\varphi \le 1$$

(3)

' % 1. D ), ( +(

(

' & ( $ ( E+) (

C+# &#

($(. +) 1. Romme L'Art de la Marine, ou Principes et Préceptes Généraux de

l'Art de Construire, d'Armer, de Manœuvrer et de Conduire des Vaisseaux, par M. Romme. La Rochelle, 1787. Chapitres VII, VIII.

&%+ 110261

2. C## , ' ') (' $ ', (02 , 1 , $' 1- ', ( ) $- # C###,

$-# 1 !-#

$&# % # ( ' 2. I" 1, 2.

' &%+ &' $ !'- J 3. -, 1793. ') 7, 8.

161152

3. U-boat Williamson G., Johnson L.: U-Boat crew 1914-45/

ed. Osprey Publishing, Great Britain, 1995.

' 60563

4. -- --) &' # . 1914–1945 / . '"#;

. '. .!. '"%. – .: «+-'"

!», 2003.

60560

5. ) ( – 23800

-'1 %') - #- & % +), ' # (#., $ #, [5, 6, 8, 13, 17]),

$+'02 ,, $# # '0 + , ' # (" - $) + ,

$ 02 . $ #, -' ( + $ (2) $-' $-'0(" 3 ' ($ #, WordNet) ' "

0 # *&& % , 102 ( (# ' 3 ''% -# ($ #, ) [5]. ' ( + $ (4) $-' -# $'"+" $ # + ##

&% ' DotPlotting #) ' ( ,- # #1- ##

[25].

!' + #) ), #1 )-' " - $'#, $$02 , , - 0 '"0 $ $ ), #. -$),, '"3 #-,

$-'), ' , + -' ' +). +)) $$)

$ # '" +-( # (

#% +() ' [3, 24]. -),, -' # , +), '0( , -#

-$ 0 -( $') ' # - ) ' ( ). , -" ,, *&& " ,

$-'1), ' # ( + + + $#

+ )# .

$'- # $'( ' + ' #) , ( #%

(#., $ #, [7, 14, 21]). + '"3 + , '1 #-'" ' ( , + ' - ## $-' '

$ ##, $ (# # #"

'"3 ' ( #, $' 1- ' +) - + #.

# (0 -' $-,-

$-' ' +#2 ,' (LDA) [2], 3 $'"+# #3 # ( . $ #, [7] LDA ' + , ( ' #,

$+'02 ) " - '

#% . !' # TopicTiling [21],

' ( TextTiling, )( '

#) ,- #1- - #

## , - $'"+0 () (# ', () - & #, $- '"

( ), -' 1- '

$#2"0 LDA.

!' #) * $ $+)0 # '(30 *&& ", (# '"3 (02

$ (# ' 1 (

$-' ' + ## . -'

" (02 $ ' + #) -'1) $ -'1" -# -# [21], ( - 1 -" -' '" ( ,

%" ## +) '""0.

2 " $- )

*$ #'" % $#

#% , 1), ( + +%

# ( #% '"), (),

. C# +) –

' #% # ,-

#1- ## .

(4)

C . 1. ! +# () # ( 1) %) # (H)

$ -, $, $- -' «Romme»

3 # " %

3.1 # ! ) +'' " + #& , (

# , +), ( , ' ,

&%+ ) $ - # . D ( $-') '. 1. '-#)

$ 3' $'''") &#) #&

+) ' ($+. 1, 3), 1 ,

$& '"), $- ($+. 2, 4).

$-,- # $'# - ( -# -' +), $+' ( # - '-" +)) $#

#% . I( +#

' # &%+# +), )$''"

$#2"0 OpenXerox [18],

# +) – $#2"0 (

' + SemSin [29]. $ $-' -'

1- )' &# (0 ( ' +. ( $ )' &# , 2 + 20 ),

&#, +), '()# +# + ($+. 5 '. 1). $-

#% 2''" $- +' ()# #-# ('. 2).

' % 2. ) $- +( $

N

'## +% + -' $-' N+ '## +% + (( +# +

2 '"),

N+++ '## +% + (( +# + 2 '"),, $ ''"), ''

N++' '## +% + (( +# +

$-'0( 3 ' &

3.2 !"

( + ' # )'

) TextTiling, ' $+(

' + #, +# + 0 #.

'-'"" 3 ' # +(

$# $- $ ) )3 (#. $. 2).

,- *$ # " ' " '-02

$#) ' #: +# ' W*k[];

+# $) #1- '# s*k [].

# , - *$ # $'"+) ' $# -' ), 02 -' +%.

3.3

' % ( #%

$-'1 - # , # ( ' 3 recall-precision [12], - [19],

#) PP Pk [1], # WindowDiff. 1- + , - -.

$ #, # WindowDiff

$'1 &), % #,

&# ), $ + ' , %, $'(), % #)#

' ##, $-', '"+2 . R#

( ' , -12 , 3 ( $-') %), # 2 ( ' . -

$'-02 '- [10, 15, 22] ) ' - - # WindowDiff. -

% '1 $-') (false positive, FP)

$$2) (false negative, FN) %), ) -'1) #" +0 +( #" + # +-( #% . # ,

# WindowDiff $+' (" ' ( 3 $-' %), 1 $ - +)3 +( 3 # (' % . + * # $-'1 - #- & %

# WindowDiff (#., $ #, [22]).

3# '- -' % (

#% $'"+'" ' F-# [28]:

0 0,2 0,4 0,6 0,8

cos M 1

+ +

A

(5)

$$F = \frac{2PR}{P + R}, \qquad (2)$$

-

$$P = \frac{TP}{H}$$

– (",

$$R = \frac{TP}{TP + FN}$$

$', FP – ' ( '1), +( , FN – ' ( $$2), +( , H – ( ' -), % #.

( *' -' #) +'

*$0 +# % #) # (A), #.

. 1. #), &#, , % 1 # - - .

- $# #% $- ' '-02 # +#. !' + '" +(

#) (1) + # " #

$ z. R( cos M, #"3 '

) z, # ' " # #), . .

% # (H). R# $- '"

$' $'(), $'-'"

. $ '" $- (TP) # ' "

'( , - , $'-'", (A H) )' #() ''") # ##) («$')») -' - 1 +% ' -' - , +%. ' $-' )#, $'", $+' -'") ' +, ( $# +% #)

$) $' # " #, , 2 ,

$,- $ ,- '" '-02# +%.

'") «$')» H #(' " '1)

(FP), «$')» A – $$2) (FN). '

$-' '(3 +( F-#) "

$ z" ' 0 - 1.

# #, ( #) )3 #

% 0 ( #% '"

$&# $( 0 +#1"

($ # +% ) $# ' #

#% , # -' +), +-(

+ 3 ( $'), ( 1 -' (), . 6

$'1 ' 2 - # ## $'"+ ) ( # F-#) (2), -

# " $ # +% ,

# ## – '(3 3 P R.

' #- + : , $ #, ' " ) *&& % ) #1- P R.

4 & ! "!

4.1 "

( $ #'"), $#

#% +# ' #

TextTiling [11] #- '"+2

(') +## (W*k) + k

$-$-'1 -' W 1-, $

$) ' s=(W*k)/2. - 3

*$ #) $'"+ # +),

# % s, W k$+' , ( '0) +(

$) , ' () ', 10

" & , $'), - % 02 %) #)'), &#, $ ( (+0 (,) $ # $ - . 2).

# #, ( * +'" $- )-#, -')# *$ #'"# '- [5].

*# 3 , -'"3 , *$ #,

$) ' $'"+'".

C . 2. R( #) -' «U-boat» c

$- $ N+: –W*k=10, s=0;

–(W*k)=20, s=(W*k)/2;

4.2 "

#('" )3, +# ' # TextTiling [11, 12] ( - %) ' + $'"+) & -' ),

$'1 $'(02 , % ' - ' 13 $-'1 ' +%. 3

*$ #) $+' , ( $ # '"

'")# ()# # ( ' + *#

'( -' '" ('. 3). 6 +

#, ( $ # (10 2 '"),) '() +# ' $ -

$' 0 '"3 ( ' «'1), »,

#"3 (", $ '"3# (40 2.) ' " , ( $ '"0 %, ( $ - $- 0

$'). # +'" -' ( «U- boat» $- $ N+ $ - ' % 3, ) 1–3.

' % 3. ' +# '

S 1 2 3 4

C+# ' [] 10 25 40 +%

F-# 0.06 0.04 0.03 0.17

#) 1 '-'

$-$'1 , ( #)') %) '- " %, +%. 1 +# ", ( ' $-') $ ( ) #

$ *# $-. $ #, [32] :

«...#1 +", ( +% (# –

#'"), & ( )-') *'#

, -12 - +0 #)'" '

&#». - ), ' [23], -' $ &#'")# '02 # (+%# ' +-'#) $+' )-' "

(6)

$-#) -#: %) +%

# ( , ' # $-", -' +%) + $ +( -#

($ #, ,-1)), '"3 +%) # -1" '"

$-#.

' *$ # #) $'"+'

$# -' ), $-02 +%#.

' 3# +%) $ - ' "

'-02#, -( '"3

"0 +%) $-'0 +' ' $' , ), 2 ', $-02 , '#

102 .

C+'") ' + -# 0 + ' ( +( F *# '( $ 0 & -' ' ('. 3, 4).

*# -'"3# -' , $'"+'" (' $ +%#.

4.3

$-'02 # $## -' $ 3 $- %) # -#

# ' " $ z. #

#"3 (" P, . . $'0 $$

%, $' R, . . '"3 ( ' % 1 . # #, 2

$ #'") " $ ( + F-#) ( . 2). $+' 3 *$ #),

$ #'" +( z '" + $

$- . ' ,

$- , # N++' ('. 2),

# #'") +( F-#) - 0 $ z=0.1…0.15. $-'0( # -$' '"), (N++'), '

$- $ +-' 4.5, # #'"

+( F-#) - $ ܟ'"3 ,

+( , z( . 3).

C . 3. R #" P, R F-#) $ z

4.4 "

# )' '-) +#1

$ # +% $- + # +) . ( -, ( $ $-

& ( $-' $ ' -' ' + +), +),, #1 $-')# +) $#

+ . ( , N+ N+ ( '0( # ' (), ') & 0 '0) $ + . N+++ ( ' -) $, ( $-'

# 0 '" '1) ( ( ), - '0)#, ( ( &

$'''") $ + .

' + #), ,, $# # -),

$-', '0(' " ' () -1'") ': – , ,, U-boat, jacket, war – ' ,, voile ($), poulie ('), mat (#(), fig (& , )' ), vergue () – &%+ ,.

R( F-#) -' $ #'" $

, $'() -' +' (),

$- «$ '#» (.. -' ,

$- , # N++'), $ -) '. 4. #'") +( )-')

$'1 )# 3 &#. # #, ( $'() +( $' 0 +'"# , , [3, 5].

' % 4.

N+ N+

-- 0.19 0.21 0.11

U-boat 0.24 0.17 0.19

Romme 0.46 0.44 0.53

C## 0.21 0.22 0.18

) 0.60

!' + ' %) $+), ( (), , -' +'#

$-+'# +( #) '

$', '' $ # (# ' +.

+) « +)#» ( + $'"+# ' . 1 # $ '" '"3 , ), -1 + $ # ,-' -$' '"),

# ( -' +( '" )3.

6 +'") ' $ $ "

$#2"0 - ## #) ,-

#1- +%# $ # ( «Romme»

( . 1).

$'3)# '")# ' # $+

(' $-+-'). -, ( -' $-', , $ 1 # ,-, - «$')», $ #) +) #1- ' + # $ #)' &## , 0 $1 , #,. *# ( -' , $-'# $ F-#, # +( '": 0.46 $ # 2 '")# $ 0

0,2 0,4 0,6 0,8 1

0,05 0,1 0,15 0,2 0,25 0,3 0,35 0,4 0,45 0,5 0,55 0,6 0,65 0,7 0,75 0,8 0,85 0,9 0,95z

P R

(7)

( $ 0 z = 0.15 $ 0.53 -')# $-'# $ ( $ 0 z = 0.1.

# +#, 1, ( # $-$( '" $'"+ '"

2 '"),, $ (# + '0(

(), ($ $ N+). ' # 2 $)3 ( #%

- ( $ ''"), ''.

&%+# '"3 *&& - -' $-'.

4.5 '

+ ' ) +, ( $-'0(

3 , ' ( , 2

$)3 ( " (") #% ,

$ (# * %'"0 #1 $'"+"

'" '" # [5], # ( ' & ' [21]. ' +) '" # ,- # ( )# -$ , $*# 3 ,

*$ #, #) $'"+' # ( ' & [33], -12 1680 ', $ )# $-') ' 190 ). '# [29].

') * ' & - $ 0 $ ## WordNet.

$-$''", ( ',

$ -'12 -# ', $-0,

$ #, ' , ,,…

«C##» ' , ,

«-- ».

C# # ,) $ #. +"## -

$-'1 + #1), +%

«-- »:

, ! "

# - , - ! .

6 $-'1 , # ( -0 2" # , 2 #0 - ), ', $*# $ «$ '#» # ,- '0. - ' & # ' , , ' -1-), ' , , , ! – ' . , ( ' -' 2 '"), $ )( ' ,- «$ '#» $'(# cos M = 0.71.

. 4 $+ + #" F-#)

$ -' -, $ #% –

«$ '#» «$ '#» (.. $- $ N++') $ # «C##».

C . 4. ' + «$ '#» «$ '#»

-, F-# $ $,- «$

'#» $)3 ' (# - +.

" 2 $ '# #

" " -' - 1 '(3 +'", $ #, ') ) #'

# " ( -'" ',

#' <- " - # 2 ' -1-). # #'"), +( F-#)

$-' ' % 5.

' % 5. F-# $ ' + «$ '#» «$

'#»

«---

» «C##» ) ($)

'# 0.21 0.22 0.60

'# 0.50 0.7 0.78

-, ( , , «$

'#» - +( '" '(3 +'").

, ( -' ), 1 $

$# z=0.5 $) -0 - ( # , . . $+ % FN FP

$-0.

# +#, ( -, ( -' - 1

$ #'#), +'" $ # (

#% (), 1 % "

,- +' ( &# $ ' , $ -'")#

'#, ( $+' +( '" *&&

$'"+" $) +, $ 02 .

5 ( )

- '- $% & $ # ' # # ( #% '")#

()# # , +), $ -

# , $ (# $ '0() $'''")

&#) #& +), ', 1 , $& '"), $-. (

$ ' # ) TextTiling, $'"+02 ''"0 &#% 0 +

#1- - # (# . '- ' ( #% ,

$#, +# '"+2 , ' (

$) #1- # , $) ".

$-') $ #'") # % $#

#% -' +' (), +). $ # +) $-1-, ( $-'0(

3 , ' ( , 2

$)3 ( #% . 0

0,2 0,4 0,6

0,05 0,1 0,15 0,2 0,25 0,3 0,35 0,4 0,45 0,5 0,55 0,6 0,65 0,7 0,75 0,8 0,85 0,9 0,95

F

z

(8)

*

[1] Beeferman, Douglas, Adam Berger, and John Lafferty. Statistical models of text segmentation.

Machine Learning, 34(1-3), February 1999.

[2] David M. Blei, Andrew Y. Ng, and Michael I.

Jordan. 2003. Latent Dirichlet allocation. Journal of Machine Learning Research, 3, p. 993–1022.

[3] Anja Habacha Chaibi, Marwa Naili, Samia Sammoud. Topic segmentation for textual document written in Arabic language. 18th International Conference on Knowledge-Based and Intelligent Information & Engineering Systems - KES2014. Procedia Computer Science 35 (2014), p. 437–446

[4] Choi, F.Y.Y. 2000. Advances in domain independent linear text segmentation. In Proceedings of the 1st Meeting of the North American Chapter of the Association for Computational Linguistics, p. 26–33.

[5] G. Dias, E. Alves, J.G.P.Lopes. Topic Segmentation Algorithms for Text Summarization and Passage Retrieval: An Exhaustive Evaluation. AAAI'07 Proceedings of the 22nd national conference on Artificial intelligence - Volume 2. p. 1334–1339

[6] Lan Du, Wray Buntine, and Mark Johnson. 2013.

Topic Segmentation with a Structured Topic Model. In Proceedings of the 2013 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, p. 190–200, Atlanta, Georgia.

[7] Jacob Eisenstein. Hierarchical Text Segmentation from Multi-Scale Lexical Cohesion. NAACL '09 Proceedings of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics, p. 353–361

[8] Jacob Eisenstein and Regina Barzilay. 2008.

Bayesian Unsupervised Topic Segmentation. In Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing, p. 334–343, Honolulu, Hawaii.

[9] Dominik Flejter, Karol Wieloch, Witold

Abramowicz. Unsupervised Methods of Topical Text Segmentation for Polish. Balto-Slavonic Natural Language Processing 2007, June 29, 2007, p. 51–58. Prague, June 2007. Association for Computational Linguistics.

[10] M Georgescul, A Clark, and S Armstrong. An analysis of quantitative aspects in the evaluation of thematic segmentation algorithms.

SigDIAL’06 Proceedings of the 7th SIGdial Workshop on Discourse and Dialogue, Jan.

2009.

[11] Marti A. Hearst. TextTiling: Segmenting text into multi-paragraph subtopic passages.

Computational Linguistics, 23(1), p. 33–64, March 1997.

[12] Marti A. Hearst and Christian Plaunt. 1993.

Subtopic structuring for full-length document

access. In SIGIR ’93: Proceedings of the 16th annual international ACM SIGIR conference on Research and development in information retrieval, p. 59–68, New York, NY, USA. ACM Press.

[13] Anna Kazantseva and Stan Szpakowicz. 2011.

Linear Text Segmentation Using Affinity Propagation. In Proceedings of the 2011 Conference on Empirical Methods in Natural Language Processing, p. 284–293, Edinburgh, Scotland.

[14] Anna Kazantseva, Stan Szpakowicz. Hierarchical Topical Segmentation with Affinity Propagation.

In Proceedings of COLING 2014, the 25th International Conference on Computational Linguistics: Technical Papers, p. 37–47, Dublin, Ireland.

[15] S. Lamprier, T. Amghar, and B. Levrat. 2008.

On evaluation methodologies for text segmentation algorithms. 19th IEEE

International Conference on Tools with Artificial Intelligence - Vol.2, Jan.

[16] David YW Lee. Genres, Registers, Text Types, Domains, and Styles: Clarifying the Concepts and Navigating a Path through the BNC Jungle.

Language Learning & Technology. Vol. 5, No. 3, p. 37-72, September 2001.

[17] Hemant Misra, François Yvon, Olivier Cappé, and Joemon M. Jose. 2011. Text segmentation: A topic modeling perspective. Information

Processing and Management, 47(4), p. 528–544.

[18] POS Tagging Open Xerox.

https://open.xerox.com/Services/fst-nlp-tools/Consume/Part%20of%20Speech%20Tagging%20%28Standard%29-178/ 2 – 14.03.15

[19] Ponte, Jay and Bruce Croft. 1997. Text segmentation by topic. In Proceedings of the First European Conference on Research and Advanced Technology for Digital Libraries.

[20] Jeffrey C. Reynar. 1994. An automatic method of finding topic boundaries. In Proceedings of the 32nd annual meeting on Association for Computational Linguistics, p. 331–333, Morristown, NJ, USA. Association for Computational Linguistics.

[21] Martin Riedl, Chris Biemann. Text Segmentation with Topic Models. JLCL 2012 – Band 27 (1), p. 47–69.

[22] M. Scaiano, D. Inkpen. Getting More from Segmentation Evaluation. 2012 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, p. 362–366, Montréal, Canada, June 3-8, 2012.

[23] Stark, Heather A., 1988. What do paragraph markings do? Discourse Processes 11, p. 275–303.

[24] Xiaojun Wan. On the effectiveness of subwords for lexical cohesion based story segmentation of Chinese broadcast news. Information Sciences 177 (2007), p. 3718–3730.

(9)

[25] Na Ye, Jingbo Zhu, Huizhen Wang, Matthew Y.

Ma, Bin Zhang. An Improved Model of Dotplotting for Text Segmentation. Journal of Chinese Language and Computing 17 (1), p. 27–40.

[26]! .., !# .., ..

# $--1 $ $), 3 -' -# (- , ( &#% // - , . 2012. 1 (77). . 128-134

[27]!3 !.., !# .., .., ..-' #-' +-$% % '" ),

$-,- $'"+ # # + %$' // SWorld. 2013.

. 8. 1. . 81-93

[28].. '"3 -. !# ( # +)

#$"0 ' . .: 6. 2011.

[29].!. , .. . # - ( ' + SemSin //

1-- &%

$ #$"0 ' « '- 2012», , 30 # – 3 0 2012 . http://www.dialog-21.ru/digest/2012/?type=doc

[30] 0. ' ( ( . .: !-# . 2010.

[31]#'" .!. $) + (#

(( # ' ( "

' # +) M. Black “Metaphor”) //

# . C +1 & '' . – 2013. )$. 4(24). – . 140–150.

[32].. & #. C +) '"

( : '% .– .: ' , , 2004.

[33].!. +. #$"0 # +). : +-- .-. - , 2004

Specifics of Applying Topic Segmentation Algorithms to Scientific Texts

K. Boyarsky, N. Gusarova, N. Dobrenko, E. Kanevsky, N. Avdeeva

This paper considers how to apply topic segmentation algorithms to real scientific texts. To study it we used monographs on the same subject written in three languages. The corpus includes several fragments both in the original and in professional translation.

The research is based on the TextTiling algorithm, which analyses how tightly adjoining parts of a text cohere. We examined how several parameters (the cutoff rate, the size of the moving window, and the shift from one block to the next) influence segmentation quality. The optimum combinations of these parameters are determined for several languages. The experiments on Russian texts show that external lexical resources (stop-lists, classifiers, ontologies) notably improve the quality of segmentation.

Références

Documents relatifs

Поэтому при поиске сначала строится СП запроса, а затем для каждого введенного пользователем словосочетания система со- ставляет набор слов, парные

В данной работы выполнено исследование диссертаций и авторефератов с целью изучения структуры научных связей ученого (научное окружение ученого),

“Evaluating discourse phenomena in neural machine translation,” in Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational

Natural language processing is such an important subject that can not only afford to develop the field of artificial intelli- gence [17], but also help our everyday lives, eg.: lives

We will use a representation of Devroye [4, 5] for the binary search tree, and a well-known bijection between binary trees and recursive trees, together with different applications

In Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics : Human Language Technologies, Volume 1 (Long Papers), p.

Two performance measures were used in this paper: the optimization reliability, defined as the probability of finding the true optimum, and the expected maximum fitness function 11.

While deep learning has transformed the nat- ural language processing (NLP) field and im- pacted the larger computational linguistics community, the rise of neural networks is