Data Preparation
Dataset
The Korean-English parallel translation corpus from AI Hub is used.
The 200,000 Korean-English pairs in 1_๊ตฌ์–ด์ฒด(1).xlsx are used to build the vocabulary and train the model.
Parsing
Since the data comes as an xlsx file, pandas and openpyxl are installed to parse it, and the result is saved as plain text files. The text is preprocessed and tokenized before saving.
# -*- coding: utf-8 -*-
import torchtext
from konlpy.tag import Okt
import pandas as pd
import re

token_ko = Okt().morphs
token_en = torchtext.data.utils.get_tokenizer('basic_english')

lines = pd.read_excel('./1_๊ตฌ์–ด์ฒด(1).xlsx', names=['sid', 'src', 'tar'])
del lines['sid']

def textprocess(kot, ent):
    # Korean: lowercase, pad sentence punctuation with a space,
    # keep only Hangul and , . ! ?, then collapse whitespace and tokenize
    kot = kot.lower().strip()
    kot = re.sub(r"([.!?])", r" \1", kot)
    kot = re.sub(r"[^ใ„ฑ-ใ…Žใ…-ใ…ฃ๊ฐ€-ํžฃ,.!?]", r" ", kot)
    kot = re.sub(r"\s+", r" ", kot)
    kot = ' '.join(token_ko(kot))
    # English: same cleanup, keeping only Latin letters and , . ! ?
    ent = ent.lower().strip()
    ent = re.sub(r"([.!?])", r" \1", ent)
    ent = re.sub(r"[^a-zA-Z,.!?]+", r" ", ent)
    ent = re.sub(r"\s+", r" ", ent)
    ent = ' '.join(token_en(ent))
    return kot, ent

with open('./kor.txt', 'w', encoding='utf-8') as ko, open('./eng.txt', 'w', encoding='utf-8') as en:
    length = len(lines)
    print(f"Total lines = {length}")
    for i in lines.index:
        if i % 2000 == 0:
            print(f'{i/length*100:.2f}%')
        text = lines.loc[i]
        kot, ent = textprocess(text['src'], text['tar'])
        ko.write(kot)
        ko.write('\n')
        en.write(ent)
        en.write('\n')
A space is inserted before the sentence punctuation in each sentence, and only the punctuation needed for Korean/English is kept. The runs of spaces produced by the deletions are collapsed into a single space. The text is then tokenized.
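As a rough illustration, here is the same cleanup applied step by step to a made-up English string:

import re

s = "Hello,   world!!"
s = s.lower().strip()
s = re.sub(r"([.!?])", r" \1", s)       # pad sentence punctuation with a space
s = re.sub(r"[^a-zA-Z,.!?]+", r" ", s)  # keep only letters and , . ! ?
s = re.sub(r"\s+", r" ", s)             # collapse runs of whitespace
print(s)  # hello, world ! !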
Vocabulary / Tokenizer
konlpy's Okt tokenizes the Korean side and torchtext's basic_english tokenizer the English side. The vocabularies are built with torchtext's build_vocab_from_iterator.
from konlpy.tag import Okt
import torch
from torchtext.vocab import build_vocab_from_iterator
import torchtext

okt = Okt()
spc = torchtext.data.utils.get_tokenizer('basic_english')

def eniter(path):
    # yield tokenized English sentences (second tab-separated column)
    with open(path, encoding='utf-8') as file:
        for line in file:
            line = line.split('\t\t\t\t')[1]
            yield spc(line.lower().strip())

def koiter(path):
    # yield tokenized Korean sentences (first tab-separated column)
    with open(path, encoding='utf-8') as file:
        for line in file:
            line = line.split('\t\t\t\t')[0]
            yield okt.morphs(line.lower().strip())

vocab_en = build_vocab_from_iterator(eniter('./koreng.txt'), min_freq=10, specials=["<pad>", "<unk>", "<s>", "<\s>"])
vocab_en.set_default_index(vocab_en["<unk>"])
torch.save(vocab_en, 'vocab_en.pth')

vocab_ko = build_vocab_from_iterator(koiter('./koreng.txt'), min_freq=10, specials=["<pad>", "<unk>", "<s>", "<\s>"])
vocab_ko.set_default_index(vocab_ko["<unk>"])
torch.save(vocab_ko, 'vocab_ko.pth')
The special tokens are <pad>, <unk>, <s>, and <\s>, with indices 0, 1, 2, and 3 respectively. <s> marks the start of a sentence and <\s> the end; <pad> fills the unused positions when batching sentences of different lengths.
Running this over the saved corpus produces the vocabulary files vocab_en.pth and vocab_ko.pth.
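As a quick sketch of how a saved vocabulary would be used to numericalize a sentence (the sample sentence and the loading step are illustrative, not part of the pipeline above):

import torch
from konlpy.tag import Okt

okt = Okt()
vocab_ko = torch.load('vocab_ko.pth')
tokens = ['<s>'] + okt.morphs('์•ˆ๋…•ํ•˜์„ธ์š”.') + ['<\s>']
ids = vocab_ko.lookup_indices(tokens)  # rare morphemes (below min_freq) map to <unk>, index 1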
Building the Model
We will build a multilayer Seq2Seq model that uses a bidirectional encoder and attention.
Because it is multilayer, the structure comes out as in the figure above.
Since the encoder is bidirectional, the hidden state must be reduced to half its dimension before it is handed to the decoder.
Encoder
class Encoder(nn.Module):
    def __init__(self, n_input, n_hidden, n_layers, dropout=0.3):
        super(Encoder, self).__init__()
        # Params
        self.n_input = n_input    # vocab size
        self.n_hidden = n_hidden  # embed size / hidden state / rnn output size
        self.n_layers = n_layers
        # Layers
        self.embedding = nn.Embedding(n_input, n_hidden, padding_idx=0)
        self.gru = nn.GRU(n_hidden, n_hidden, n_layers, bidirectional=True, dropout=dropout)
        self.projection = nn.Linear(2 * n_hidden, n_hidden)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, h_0, lengths):
        # x: (L, N)
        x = self.dropout(self.embedding(x))
        x = nn.utils.rnn.pack_padded_sequence(x, lengths)
        x, h_t = self.gru(x, h_0)  # h_t: (2*n_layers, N, n_hidden), x: (L, N, 2*n_hidden)
        x, _ = nn.utils.rnn.pad_packed_sequence(x)
        # concatenate forward/backward outputs and project back to n_hidden
        x = torch.tanh(self.projection(torch.cat((x[:, :, self.n_hidden:], x[:, :, :self.n_hidden]), dim=2)))  # x: (L, N, n_hidden)
        return x, h_t
This is an RNN built from GRU cells. The embedding dimension is set equal to the hidden dimension.
The forward pass calls nn.utils.rnn.pack_padded_sequence(x, lengths), which lets the GRU skip <pad> tokens when a whole batch is processed at once; pad_packed_sequence then restores the padded layout. Note that pack_padded_sequence requires the batch to be sorted by length in descending order, which is handled at training time.
At every timestep the GRU outputs that step's hidden state, giving a tensor of shape $(L, N, 2\times H)$ because of the two directions. To match dimensions for the attention computed later in the decoder, the forward and backward halves are concatenated and projected down to $H$.
The final hidden state is returned unchanged.
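A minimal shape check of the encoder; the vocabulary size, batch size, and lengths below are arbitrary:

import torch
import torch.nn as nn

enc = Encoder(n_input=8000, n_hidden=256, n_layers=2)
x = torch.randint(1, 8000, (15, 4))      # (L, N) token indices
h0 = torch.zeros(2 * 2, 4, 256)          # (2*n_layers, N, n_hidden)
lengths = torch.tensor([15, 12, 9, 5])   # sorted in descending order for packing
out, h_t = enc(x, h0, lengths)
print(out.shape)  # torch.Size([15, 4, 256])
print(h_t.shape)  # torch.Size([4, 4, 256])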
Decoder
class AttnDecoder(nn.Module):
    def __init__(self, n_input, n_hidden, n_layers, dropout=0.3):
        super(AttnDecoder, self).__init__()
        # Params
        self.n_input = n_input
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        # Layers
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding(n_input, n_hidden, padding_idx=0)
        self.gru = nn.GRU(n_hidden * 2, n_hidden, n_layers, dropout=dropout)
        self.fc = nn.Linear(n_hidden * 3, n_input)
        # Attention weights
        self.Wq = nn.Linear(n_hidden, n_hidden, bias=False)
        self.Wk = nn.Linear(n_hidden, n_hidden, bias=False)
        self.Wc = nn.Linear(n_hidden, 1, bias=False)

    def forward(self, x, h_prev, enc_hiddens, mask):
        x = x.unsqueeze(0)
        x = self.dropout(self.embedding(x))  # (1, N) -> (1, N, n_hidden)
        # additive score between the previous top-layer state and every encoder state
        scores = self.Wc(torch.tanh(self.Wq(h_prev[-1].unsqueeze(0)) + self.Wk(enc_hiddens))).squeeze(2)  # (L, N)
        scores = torch.softmax(scores.masked_fill(mask == False, -float('inf')), dim=0).transpose(0, 1).unsqueeze(1)  # (N, 1, L)
        enc_hiddens = enc_hiddens.transpose(0, 1)  # (N, L, n_hidden)
        attn = torch.bmm(scores, enc_hiddens).transpose(0, 1)  # (1, N, n_hidden)
        out, h = self.gru(torch.cat((attn, x), dim=2), h_prev)  # h: (n_layers, N, n_hidden), out: (1, N, n_hidden)
        out = torch.log_softmax(self.fc(torch.cat((out[0], x[0], attn[0]), dim=1)), dim=1)  # (N, n_input)
        return out, h, scores
Bahdanau attention is used. Unlike dot attention, the hidden state from timestep $t-1$ is used for attention first, and the result is then combined with the embedding output and fed into the GRU. The Bahdanau score for the $j$-th encoder hidden state (of the last layer) is

$score(s_{t-1}, h_j) = W_c \tanh(W_a [s_{t-1}; h_j]) = W_c \tanh(W_q s_{t-1} + W_k h_j)$

Since lengths differ within a batch, this is computed for every valid $j$ and a softmax is taken. A mask is passed in to pick out the valid $j$; setting a score to negative infinity makes its weight 0 after the softmax.
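A tiny illustration of the masking trick, with made-up scores:

import torch

scores = torch.tensor([1.0, 2.0, 3.0])
mask = torch.tensor([True, True, False])  # third position is <pad>
masked = scores.masked_fill(mask == False, -float('inf'))
print(torch.softmax(masked, dim=0))       # tensor([0.2689, 0.7311, 0.0000])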
The attention weights are then multiplied with the encoder hidden states (the torch.bmm call), and the result is concatenated with the embedding output and fed into the GRU cell.
The final output is computed from the concatenation of the attention result, the RNN output, and the embedding output.
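One decoder step, shape-wise, continuing the illustrative sizes from the encoder check above:

import torch

dec = AttnDecoder(n_input=6000, n_hidden=256, n_layers=2)
enc_hiddens = torch.randn(15, 4, 256)       # encoder outputs, (L, N, H)
h_prev = torch.zeros(2, 4, 256)             # (n_layers, N, H)
mask = torch.ones(15, 4, dtype=torch.bool)  # all positions valid
tok = torch.randint(1, 6000, (4,))          # previous token per batch element, (N)
out, h, scores = dec(tok, h_prev, enc_hiddens, mask)
print(out.shape, h.shape, scores.shape)     # (4, 6000) (2, 4, 256) (4, 1, 15)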
Seq2Seq Integration
class Seq2Seq(nn.Module):
    def __init__(self, n_enc_input, n_dec_input, n_hidden, n_layers, dropout=0.3):
        super(Seq2Seq, self).__init__()
        self.n_input = n_enc_input
        self.n_output = n_dec_input
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.projection = nn.Linear(2 * n_hidden, n_hidden)
        self.encoder = Encoder(n_enc_input, n_hidden, n_layers, dropout=dropout)
        self.decoder = AttnDecoder(n_dec_input, n_hidden, n_layers, dropout=dropout)

    def forward(self, x, y, x_lengths, tf_p=0.5):
        # x, y: (L, N)
        # prepare
        maxlen = y.shape[0]
        batch = y.shape[1]
        h = torch.zeros(self.n_layers * 2, batch, self.n_hidden).to(DEVICE)
        outputs = torch.zeros(maxlen, batch, self.n_output).to(DEVICE)
        mask = (x != 0).to(DEVICE)  # (L, N), True at non-<pad> positions
        # encoder forward
        hiddens, h_dec = self.encoder(x, h, x_lengths)  # hiddens: (L, N, n_hidden), h_dec: (2*n_layers, N, n_hidden)
        h_dec = torch.tanh(self.projection(torch.cat((h_dec[:self.n_layers], h_dec[self.n_layers:]), dim=2)))  # (n_layers, N, n_hidden)
        # decoder forward
        dec_input = torch.ones(batch).long().to(DEVICE) * 2  # <s> token, (N)
        for i in range(maxlen):
            out, h_dec = self.decoder(dec_input, h_dec, hiddens, mask)  # out: (N, n_dec_input)
            outputs[i] = out
            argmax = out.argmax(1)
            tf = random.random() < tf_p
            if tf:
                dec_input = y[i]    # teacher forcing: feed the ground-truth token
            else:
                dec_input = argmax  # feed the model's own prediction, (N); already int64
        outputs = outputs.transpose(0, 1)  # (N, L, n_out)
        return outputs
First, the encoder's initial hidden state and a tensor to hold the outputs are created, along with the mask marking the encoder's valid (non-<pad>) token positions.
Running the encoder yields the last layer's hidden states at every position, plus the final hidden state that will be handed over to the decoder.
Since the bidirectional encoder's final hidden state has shape $(2\times layers, N, H)$, the two halves of the first dimension are concatenated along the third dimension, merging it into $(layers, N, H)$. This becomes the decoder's initial hidden state.
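The merge in isolation, with illustrative sizes:

import torch

h_dec = torch.randn(4, 3, 256)                     # (2*layers, N, H) with layers=2
merged = torch.cat((h_dec[:2], h_dec[2:]), dim=2)  # (layers, N, 2*H) = (2, 3, 512)
# tanh(Linear(2*H, H)) then maps this back to (layers, N, H)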
The decoder's first input must mark the start of a sentence, so it is always <s> (index 2).
Timesteps are iterated up to the maximum length of the target sentence.
Each step feeds the decoder and collects its output. The returned hidden state is stored back into h_dec so it can be passed on to the next timestep. The output is saved, and the index of the most probable word goes into argmax.
Teacher forcing is applied stochastically according to a random number drawn from [0, 1). When teacher forcing fires, the ground-truth word is fed as the next decoder input; otherwise the current output is fed as the next input.
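Finally, a minimal end-to-end sketch of the forward pass; the sizes and DEVICE below are illustrative, and since the decoder emits log-probabilities, NLLLoss would be the matching training criterion:

import random
import torch
import torch.nn as nn

DEVICE = torch.device('cpu')
model = Seq2Seq(n_enc_input=8000, n_dec_input=6000, n_hidden=256, n_layers=2).to(DEVICE)
x = torch.randint(1, 8000, (15, 4))       # source batch, (L, N)
y = torch.randint(1, 6000, (12, 4))       # target batch, (L, N)
x_lengths = torch.tensor([15, 12, 9, 5])  # sorted in descending order for packing
outputs = model(x, y, x_lengths)          # (N, L, n_dec_input) log-probabilities
print(outputs.shape)  # torch.Size([4, 12, 6000])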