0814 embedding으로 단어 연관성 예측하기

# 필요한 라이브러리 부름
import tensorflow as tf
import numpy as np
from keras.utils import np_utils 
import matplotlib.pyplot as plt

# Reset the default TensorFlow graph.
tf.reset_default_graph() 
# Set the random seed for reproducibility.
tf.set_random_seed(1) 

# Hyperparameters
learning_rate = 0.1   # step size handed to the gradient-descent optimizer
nepochs = 10000       # number of training iterations
embedding_dim = 2     # dimensionality of each word embedding
window_size = 1       # neighbors taken on each side of a target word,
                      # i.e. which surrounding words count as related

# Sentence(s) the word embedding is trained on.
text = "King is a brave man Queen is a beautiful woman"

# Convert to lower case.
text = text.lower()

# Tokens that are discarded as stop words.
STOPWORDS = frozenset(['a', 'is', 'the'])


def tokenize(sentence, stopwords=STOPWORDS):
    """Split *sentence* on whitespace and drop noise tokens.

    A token is dropped when it is a lone period, is made up only of digits,
    or is contained in *stopwords*.

    Digits are detected with ``str.isdigit`` rather than a substring test
    against ``'0123456789'``: the substring test only matches contiguous
    runs such as "12" and lets numbers like "21" or "2019" through.

    Args:
        sentence: Text to tokenize (expected to be lower-cased already).
        stopwords: Collection of tokens to discard.

    Returns:
        The remaining tokens, in their original order.
    """
    return [
        word for word in sentence.split()
        if word != '.' and not word.isdigit() and word not in stopwords
    ]


word_seq = tokenize(text)

# Bare expression; presumably kept so a notebook cell displays the tokens.
word_seq

# Set of distinct words in the token sequence.
unique_words = set(word_seq)
n_unique_words = len(unique_words)
unique_words

# Word <-> integer id lookup tables.
# Iteration order of a set of strings is not stable between interpreter runs
# (string hashing is randomized per process), so the ids are assigned over
# the sorted vocabulary. This keeps the mapping, and therefore the one-hot
# encoding built from it, identical from run to run.
vocabulary = sorted(unique_words)
word_to_int = {w: i for i, w in enumerate(vocabulary)}
int_to_word = dict(enumerate(vocabulary))
print(word_to_int)
print(int_to_word)

# Build the training pairs: [input, target] = [neighbor, target].
data = []
# Only positions that have a full window on both sides are used as targets.
# Deriving the bounds from window_size keeps the left index from going
# negative (which would silently wrap to the end of the list) and the right
# index from running past the end when window_size is larger than 1.
for i in range(window_size, len(word_seq) - window_size):
    target = word_seq[i]
    print("target : ", target)
    neighbor = []
    for offset in range(1, window_size + 1):
        # Words `offset` positions to the left and right of the target.
        neighbor.append(word_seq[i - offset])
        print(neighbor)
        neighbor.append(word_seq[i + offset])
        print(neighbor)

    # One training pair per neighbor of this target.
    data.extend([w, target] for w in neighbor)
print("data : ", data)

# Encode each [neighbor, target] pair as one-hot vectors over the vocabulary,
# which ties every neighbor word to the target it appeared next to.
x_train = [
    np_utils.to_categorical(word_to_int[pair[0]], n_unique_words)  # neighbor
    for pair in data
]
y_train = [
    np_utils.to_categorical(word_to_int[pair[1]], n_unique_words)  # target
    for pair in data
]

print(x_train)
print(y_train)

# Convert the lists of vectors to NumPy arrays.
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
print(x_train)
print(y_train)

# Placeholders fed during training. The first dimension is None because the
# number of samples is not known when the graph is built.
X = tf.placeholder(tf.float32, shape=(None, n_unique_words))
Y = tf.placeholder(tf.float32, shape=(None, n_unique_words))

# Input -> hidden weights and bias, initialised with tf.random_normal.
W1 = tf.Variable(tf.random_normal([n_unique_words, embedding_dim]))
print("w1 : ", W1.get_shape())
b1 = tf.Variable(tf.random_normal([embedding_dim]))
print("b1 : ", b1.get_shape())
# Hidden layer value: (X x W1) + b1.
hidden_representation = tf.add(tf.matmul(X, W1), b1)
print(hidden_representation.get_shape())
# Hidden -> output weights and bias.
W2 = tf.Variable(tf.random_normal([embedding_dim, n_unique_words]))
print("W2 : ", W2.get_shape())
b2 = tf.Variable(tf.random_normal([n_unique_words]))
print("b2 : ", b2.get_shape())

# Output: softmax over (hidden x W2) + b2.
prediction = tf.nn.softmax(tf.add(tf.matmul(hidden_representation, W2), b2))
print(prediction.get_shape())

# Loss: cross entropy between Y and the softmax output, summed over axis 1
# and averaged over the remaining dimension.
# The probabilities are clipped away from 0 before taking the log so that a
# saturated softmax output cannot produce log(0) = -inf and turn the loss
# (and its gradients) into NaN.
cross_entropy_loss = tf.reduce_mean(
    -tf.reduce_sum(Y * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)), axis=1)
)

# Optimizer: plain gradient descent minimising the loss above.
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy_loss)

#------------------------------------------------
# Run the TensorFlow graph: initialise variables and train
#------------------------------------------------
sess = tf.Session()                       # open a session
init = tf.global_variables_initializer()  # variable initialiser op
sess.run(init)

# The same feed (all of x_train / y_train) is used at every step.
train_feed = {X: x_train, Y: y_train}
losses = []  # loss recorded after each training step
for epoch in range(nepochs):
    sess.run(train_step, feed_dict=train_feed)
    # Loss is evaluated in a separate run, i.e. after the update is applied.
    loss = sess.run(cross_entropy_loss, feed_dict=train_feed)
    losses.append(loss)
    if not epoch % 100:  # report progress every 100 epochs
        print('epoch={}, loss = {}' .format(epoch, loss))

'딥러닝' 카테고리의 다른 글

0819 강화학습 [아타리게임(과일받기)] (0)	2019.08.19
0814 RNN 구조로 뒤에올 문자 예측하기 (0)	2019.08.14
0813 ResNet 프로젝트 (0)	2019.08.13
0813 CNN으로 MNIST 분류기 구현하기 (0)	2019.08.13
0812 ANN을 이용한 MNIST 숫자 분류기 구현 (0)	2019.08.12

'딥러닝' 카테고리의 다른 글

티스토리툴바