Ko-En Translation

๋ฐ์ดํ„ฐ ์ค€๋น„

๋ฐ์ดํ„ฐ์…‹

AI Hub์˜ ํ•œ๊ตญ์–ด-์˜์–ด ๋ฒˆ์—ญ ๋ง๋ญ‰์น˜(๋ณ‘๋ ฌ) ์‚ฌ์šฉํ•œ๋‹ค.

1_๊ตฌ์–ด์ฒด(1).xlsx์˜ 200,000๊ฐœ์˜ ํ•œ๊ตญ์–ด-์˜์–ด ์Œ์„ ์ด์šฉํ•ด ์–ดํœ˜๋ฅผ ๊ตฌ์„ฑํ•˜๊ณ  ํ•™์Šตํ•œ๋‹ค.

ํŒŒ์‹ฑ

xlsx ํŒŒ์ผ์ด๋ฏ€๋กœ pandas์™€ openpyxl์„ ์„ค์น˜ํ•ด ํŒŒ์‹ฑํ•˜๊ณ , ํ…์ŠคํŠธ ํŒŒ์ผ๋กœ ์ €์žฅํ•œ๋‹ค. ์ €์žฅํ•  ๋•Œ ๋ฏธ๋ฆฌ ์ „์ฒ˜๋ฆฌ ๋ฐ tokenize ํ›„ ์ €์žฅํ•œ๋‹ค.

# -*- coding: utf-8 -*-
import re

import pandas as pd
import torchtext
from konlpy.tag import Okt

token_ko = Okt().morphs
token_en = torchtext.data.utils.get_tokenizer('basic_english')


lines = pd.read_excel('./1_๊ตฌ์–ด์ฒด(1).xlsx', names=['sid', 'src', 'tar'])
del lines['sid']

def textprocess(kot, ent):
    # Korean: lowercase, pad punctuation with a space, keep only
    # Hangul and . , ! ?, collapse whitespace, then morph-tokenize.
    kot = kot.lower().strip()
    kot = re.sub(r"([.!?])", r" \1", kot)
    kot = re.sub(r"[^ใ„ฑ-ใ…Žใ…-ใ…ฃ๊ฐ€-ํžฃ,.!?]", r" ", kot)
    kot = re.sub(r"\s+", r" ", kot)
    kot = ' '.join(token_ko(kot))

    # English: same cleanup, keeping only Latin letters and . , ! ?
    ent = ent.lower().strip()
    ent = re.sub(r"([.!?])", r" \1", ent)
    ent = re.sub(r"[^a-zA-Z,.!?]+", r" ", ent)
    ent = re.sub(r"\s+", r" ", ent)
    ent = ' '.join(token_en(ent))

    return kot, ent

with open('./kor.txt', 'w', encoding='utf-8') as ko, open('./eng.txt', 'w', encoding='utf-8') as en:
    length = len(lines)
    print(f"Total lines = {length}")
    for i in lines.index:
        if i % 2000 == 0:
            print(f'{i/length*100:.2f}%')

        text = lines.loc[i]
        kot, ent = textprocess(text['src'], text['tar'])

        ko.write(kot)
        ko.write('\n')
        en.write(ent)
        en.write('\n')

A space is inserted before each punctuation mark in the sentence, and only Hangul or English characters plus the required punctuation are kept. The runs of spaces created by the deletions are collapsed into a single space, and the text is then tokenized.
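
As a quick sanity check, the hypothetical pair below shows what textprocess produces (the exact Korean morph boundaries depend on the installed Okt version):

kot, ent = textprocess("์•ˆ๋…•ํ•˜์„ธ์š”! ๋ฐ˜๊ฐ€์›Œ์š”.", "Hello! Nice to meet you.")
print(kot)  # e.g. "์•ˆ๋…•ํ•˜์„ธ์š” ! ๋ฐ˜๊ฐ€์›Œ์š” ." (morph splits vary by Okt version)
print(ent)  # "hello ! nice to meet you ."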

Vocabulary / Tokenizer

Korean is tokenized with konlpy's Okt and English with torchtext's basic_english tokenizer. The vocabularies are built with torchtext.

import torch
import torchtext
from torchtext.vocab import build_vocab_from_iterator
from konlpy.tag import Okt

okt = Okt()
spc = torchtext.data.utils.get_tokenizer('basic_english')


def eniter(path):
    # English sentences, one per line, already preprocessed.
    with open(path, encoding='utf-8') as file:
        for line in file:
            yield spc(line.lower().strip())


def koiter(path):
    # Korean sentences, one per line, already preprocessed.
    with open(path, encoding='utf-8') as file:
        for line in file:
            yield okt.morphs(line.lower().strip())


vocab_en = build_vocab_from_iterator(eniter('./eng.txt'), min_freq=10,
                                     specials=["<pad>", "<unk>", "<s>", "</s>"])
vocab_en.set_default_index(vocab_en["<unk>"])
torch.save(vocab_en, 'vocab_en.pth')


vocab_ko = build_vocab_from_iterator(koiter('./kor.txt'), min_freq=10,
                                     specials=["<pad>", "<unk>", "<s>", "</s>"])
vocab_ko.set_default_index(vocab_ko["<unk>"])
torch.save(vocab_ko, 'vocab_ko.pth')

The special tokens are <pad>, <unk>, <s>, and </s>, at indices 0, 1, 2, and 3 respectively. <s> marks the start of a sentence and </s> its end, while <pad> fills the empty slots when batching sentences of different lengths.

Running this against kor.txt and eng.txt produces vocab_ko.pth and vocab_en.pth.
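
A minimal check, assuming the .pth files above have just been written, that the special tokens landed at the expected indices:

import torch

vocab_ko = torch.load('vocab_ko.pth')
print([vocab_ko[t] for t in ["<pad>", "<unk>", "<s>", "</s>"]])  # [0, 1, 2, 3]
print(vocab_ko["some-unseen-token"])  # 1, the <unk> default index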

๋ชจ๋ธ ๋งŒ๋“ค๊ธฐ

์–‘๋ฐฉํ–ฅ ์ธ์ฝ”๋”์™€ ์–ดํ…์…˜์„ ์‚ฌ์šฉํ•œ multilayer Seq2seq๋ฅผ ๋งŒ๋“ค ๊ฒƒ์ด๋‹ค.

multilayer์ด๋ฏ€๋กœ ์œ„์˜ ๊ทธ๋ฆผ๊ณผ ๊ฐ™์€ ๊ตฌ์กฐ๊ฐ€ ๋‚˜์˜จ๋‹ค.

์ธ์ฝ”๋”๊ฐ€ ์–‘๋ฐฉํ–ฅ์ด๋ฏ€๋กœ ๋””์ฝ”๋”๋กœ hidden state๋ฅผ ๋„˜๊ธธ ๋•Œ ๋ฐ˜๋“œ์‹œ ์ฐจ์›์„ ์ ˆ๋ฐ˜์œผ๋กœ ์ค„์—ฌ์ฃผ์–ด์•ผ ํ•œ๋‹ค.

์ธ์ฝ”๋”

import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, n_input, n_hidden, n_layers, dropout=0.3):
        super(Encoder, self).__init__()

        # Params
        self.n_input = n_input      # vocab size
        self.n_hidden = n_hidden    # embed size / hidden state / rnn output size
        self.n_layers = n_layers

        # Layers
        self.embedding = nn.Embedding(n_input, n_hidden, padding_idx=0)
        self.gru = nn.GRU(n_hidden, n_hidden, n_layers, bidirectional=True, dropout=dropout)
        self.projection = nn.Linear(2 * n_hidden, n_hidden)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, h_0, lengths):
        # x : (L, N)
        x = self.dropout(self.embedding(x))

        # Skip <pad> positions; the batch must already be sorted by length (descending).
        x = nn.utils.rnn.pack_padded_sequence(x, lengths)
        x, h_t = self.gru(x, h_0)  # h_t: (2*n_layers, N, n_hidden), x: (L, N, 2*n_hidden)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x)

        # Merge the forward and backward outputs back down to n_hidden.
        x = torch.tanh(self.projection(torch.cat((x[:, :, self.n_hidden:], x[:, :, :self.n_hidden]), dim=2)))  # x: (L, N, n_hidden)

        return x, h_t

GRU Cell์„ ์ด์šฉํ•œ RNN์„ ๋งŒ๋“ ๋‹ค. ์€๋‹‰์ธต๊ณผ ์ž„๋ฒ ๋”ฉ ์ฐจ์›์ด ๊ฐ™๋‹ค๊ณ  ์„ค์ •ํ–ˆ๋‹ค.

(20, 22๋ฒˆ ์ค„)nn.utils.rnn.pack_padded_sequence(x, lengths) ๋ถ€๋ถ„์ด ์žˆ๋Š”๋ฐ, ์ด๋Š” ์—ฌ๋Ÿฌ ๊ฐœ์˜ ๋ฐฐ์น˜๋ฅผ ํ•œ๋ฒˆ์— ์—ฐ์‚ฐํ•  ๋•Œ, <pad> ํ† ํฐ์€ ๊ณ„์‚ฐํ•˜์ง€ ์•Š๋„๋ก ๋งŒ๋“ค์–ด์ค€๋‹ค. ๋‹จ, ์ž…๋ ฅ ๋ฐฐ์น˜์˜ ๊ธธ์ด๋ฅผ ๋‚ด๋ฆผ์ฐจ์ˆœ์œผ๋กœ ์ •๋ ฌํ•ด ์ค„ ํ•„์š”๊ฐ€ ์žˆ์œผ๋ฏ€๋กœ, train ์‹œ ์ฒ˜๋ฆฌํ•œ๋‹ค.
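
A self-contained sketch (toy token ids) of the sort-then-pack pattern assumed here:

import torch
import torch.nn as nn

PAD = 0
batch = [torch.tensor([5, 6, 7]), torch.tensor([8, 9]), torch.tensor([10, 11, 12, 13])]
lengths = torch.tensor([len(s) for s in batch])
x = nn.utils.rnn.pad_sequence(batch, padding_value=PAD)   # (L, N) = (4, 3)

lengths, order = lengths.sort(descending=True)            # sort the batch by length
x = x[:, order]

emb = nn.Embedding(20, 4, padding_idx=PAD)(x)             # (L, N, 4)
packed = nn.utils.rnn.pack_padded_sequence(emb, lengths)  # <pad> steps are skipped
out, h = nn.GRU(4, 4)(packed)
out, out_lengths = nn.utils.rnn.pad_packed_sequence(out)
print(out.shape, out_lengths)  # torch.Size([4, 3, 4]) tensor([4, 3, 2])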

At each timestep the GRU outputs that step's hidden state; with a bidirectional encoder the stacked outputs have shape (L, N, 2ร—H). To match dimensions for the decoder's attention later, the forward and backward halves are concatenated and projected back down to H.

The final hidden state is returned as-is.

๋””์ฝ”๋”

import torch
import torch.nn as nn

class AttnDecoder(nn.Module):
    def __init__(self, n_input, n_hidden, n_layers, dropout=0.3):
        super(AttnDecoder, self).__init__()

        # Params
        self.n_input = n_input
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        # Layers
        self.dropout = nn.Dropout(p=dropout)
        self.embedding = nn.Embedding(n_input, n_hidden, padding_idx=0)
        self.gru = nn.GRU(n_hidden * 2, n_hidden, n_layers, dropout=dropout)
        self.fc = nn.Linear(n_hidden * 3, n_input)

        # Attention weights
        self.Wq = nn.Linear(n_hidden, n_hidden, bias=False)
        self.Wk = nn.Linear(n_hidden, n_hidden, bias=False)
        self.Wc = nn.Linear(n_hidden, 1, bias=False)

    def forward(self, x, h_prev, enc_hiddens, mask):

        x = x.unsqueeze(0)
        x = self.dropout(self.embedding(x))  # (1, N) -> (1, N, n_hidden)

        # Bahdanau score against every encoder position.
        scores = self.Wc(torch.tanh(self.Wq(h_prev[-1].unsqueeze(0)) + self.Wk(enc_hiddens))).squeeze(2)  # (L, N)

        # Mask out <pad> positions, then normalize over the source length.
        scores = torch.softmax(scores.masked_fill(~mask, -float('inf')), dim=0).transpose(0, 1).unsqueeze(1)  # (N, 1, L)

        enc_hiddens = enc_hiddens.transpose(0, 1)  # (N, L, n_hidden)
        attn = torch.bmm(scores, enc_hiddens).transpose(0, 1)  # (1, N, n_hidden)

        out, h = self.gru(torch.cat((attn, x), dim=2), h_prev)  # h: (n_layers, N, n_hidden), out: (1, N, n_hidden)

        out = torch.log_softmax(self.fc(torch.cat((out[0], x[0], attn[0]), dim=1)), dim=1)  # (N, n_input)

        return out, h, scores

The decoder uses Bahdanau attention. Unlike dot attention, the hidden state from timestep t-1 goes through the attention computation first, and the attention result is concatenated with the embedding output before entering the GRU. The Bahdanau score, for the j-th encoder hidden state (from the last layer), is:

$$\mathrm{score}(s_{t-1}, h_j) = W_c \tanh(W_a [s_{t-1}; h_j]) = W_c \tanh(W_q s_{t-1} + W_k h_j)$$
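
A small standalone sketch, with arbitrary dimensions, of how this score is computed in batch using the Wq/Wk/Wc layers above:

import torch
import torch.nn as nn

L, N, H = 6, 3, 8
Wq = nn.Linear(H, H, bias=False)
Wk = nn.Linear(H, H, bias=False)
Wc = nn.Linear(H, 1, bias=False)

s_prev = torch.randn(1, N, H)   # s_{t-1}: last-layer decoder state, broadcast over L
enc = torch.randn(L, N, H)      # h_j: encoder hidden states
scores = Wc(torch.tanh(Wq(s_prev) + Wk(enc))).squeeze(2)
print(scores.shape)             # torch.Size([6, 3]): one score per source position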

Because sentences in a batch have different lengths, the score is computed for every valid position j and then passed through a softmax. A mask is taken as input to pick out the valid positions; setting a masked score to negative infinity gives it a softmax weight of 0.
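
Concretely (toy numbers), masking before the softmax zeroes out the padded position:

import torch

scores = torch.tensor([[2.0], [0.5], [1.0], [3.0]])     # (L, N) with L=4, N=1
mask = torch.tensor([[True], [True], [True], [False]])  # last source position is <pad>
w = torch.softmax(scores.masked_fill(~mask, -float('inf')), dim=0)
print(w.squeeze(1))  # tensor([0.6285, 0.1402, 0.2312, 0.0000])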

The attention weights are multiplied against the encoder hidden states (a batched matrix multiply), and the result is concatenated with the embedding output and fed into the GRU.

The final output is computed from the concatenation of the attention result, the RNN output, and the embedding output.

Seq2Seq Integration

import random

import torch
import torch.nn as nn

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Seq2Seq(nn.Module):
    def __init__(self, n_enc_input, n_dec_input, n_hidden, n_layers, dropout=0.3):
        super(Seq2Seq, self).__init__()

        self.n_input = n_enc_input
        self.n_output = n_dec_input
        self.n_layers = n_layers
        self.n_hidden = n_hidden

        self.projection = nn.Linear(2 * n_hidden, n_hidden)

        self.encoder = Encoder(n_enc_input, n_hidden, n_layers, dropout=dropout)
        self.decoder = AttnDecoder(n_dec_input, n_hidden, n_layers, dropout=dropout)

    def forward(self, x, y, x_lengths, tf_p=0.5):

        # x, y : (L, N)
        # prepare
        maxlen = y.shape[0]
        batch = y.shape[1]
        h = torch.zeros(self.n_layers * 2, batch, self.n_hidden).to(DEVICE)
        outputs = torch.zeros(maxlen, batch, self.n_output).to(DEVICE)
        mask = (x != 0)  # (L, N): True at non-<pad> positions
        mask = mask.to(DEVICE)

        # encoder forward
        hiddens, h_dec = self.encoder(x, h, x_lengths)  # hiddens: (L, N, n_hidden), h_dec: (2*n_layers, N, n_hidden)
        # Pair each layer's forward/backward state, then project 2H -> H.
        h_dec = h_dec.view(self.n_layers, 2, batch, self.n_hidden)
        h_dec = torch.tanh(self.projection(torch.cat((h_dec[:, 0], h_dec[:, 1]), dim=2)))  # (n_layers, N, n_hidden)

        # decoder forward
        dec_input = torch.ones(batch).long().to(DEVICE) * 2  # <s> token, (N)

        for i in range(maxlen):
            out, h_dec, _ = self.decoder(dec_input, h_dec, hiddens, mask)  # out: (N, n_dec_input)
            outputs[i] = out
            argmax = out.argmax(1)

            # Teacher forcing with probability tf_p.
            tf = random.random() < tf_p
            if tf:
                dec_input = y[i]
            else:
                dec_input = argmax  # (N)

        outputs = outputs.transpose(0, 1)  # (N, L, n_out)

        return outputs

In forward, we first build the encoder's initial hidden state and a tensor to collect the outputs, plus the mask marking the encoder's valid (non-<pad>) token positions.

Running the encoder gives the last layer's hidden states at every position, along with the final hidden state to hand over to the decoder.

Since the bidirectional encoder's final hidden state has shape (2ร—layers, N, H), the first dimension is folded into the third by concatenating each layer's forward and backward states, yielding (layers, N, H). This is passed as the decoder's initial hidden state.

The decoder's first input is always <s>, since it must mark the start of the sentence.

The timestep loop runs up to the maximum length of the target sentences.

At each step the decoder consumes its input and produces an output; the returned hidden state is written back into h_dec so it carries over to the next timestep. The output is stored, and the index of the most probable word is kept in argmax.

Teacher forcing is applied stochastically via a random number in [0, 1). When teacher forcing fires, the ground-truth word becomes the next decoder input; otherwise the current prediction does.
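
A hedged smoke test on dummy data (all sizes are placeholders; equal source lengths keep the batch trivially sorted for pack_padded_sequence):

model = Seq2Seq(n_enc_input=100, n_dec_input=120, n_hidden=16, n_layers=2).to(DEVICE)

L_src, L_tgt, N = 7, 9, 4
x = torch.randint(1, 100, (L_src, N)).to(DEVICE)  # source token ids (no <pad>)
x_lengths = torch.full((N,), L_src)               # equal lengths: already "sorted"
y = torch.randint(1, 120, (L_tgt, N)).to(DEVICE)  # target token ids

out = model(x, y, x_lengths)
print(out.shape)  # torch.Size([4, 9, 120]): (N, L_tgt, n_dec_input) log-probabilities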
