# TensorFlow

## 공통 코드

In [1]:
# 파이썬 ≥3.5 필수
import sys
assert sys.version_info >= (3, 5)

# 사이킷런 ≥0.20 필수
import sklearn
assert sklearn.__version__ >= "0.20"

# 공통 모듈 임포트
import numpy as np
import os

# 노트북 실행 결과를 동일하게 유지하기 위해
np.random.seed(42)

# 깔끔한 그래프 출력을 위해
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# 그림을 저장할 위치
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "tensorflow"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into IMAGES_PATH.

    Args:
        fig_id: Base name of the output file (without extension).
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``plt.savefig`` as
            the output format.
        resolution: Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = ".".join((fig_id, fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("그림 저장:", fig_id)
    if tight_layout:
        plt.tight_layout()
    save_options = {"format": fig_extension, "dpi": resolution}
    plt.savefig(destination, **save_options)
    
import warnings
warnings.filterwarnings(action='ignore')

## TensorFlow 설치

In [2]:
#!pip install tensorflow

## TensorFlow 버전 확인

In [None]:
import tensorflow as tf
print('텐서 플로 버전:', tf.__version__)

## Tensor와 연산

### Tensor 정의

In [None]:
tf.constant(42) # 스칼라
tf.constant([[1., 2., 3.], [4., 5., 6.]]) # 행렬
t = tf.constant([[1., 2., 3.], [4., 5., 6.]])
t

### 인덱스 참조

In [None]:
print(t[:, 1:])
print()
print(t[..., 1, tf.newaxis])

### 연산

In [None]:
print(t + 10)
print()

print(tf.square(t))
print()

print(t @ tf.transpose(t))

### Keras의 저수준 API

In [None]:
from tensorflow import keras
K = keras.backend
K.square(K.transpose(t)) + 10

### Tensor와 NumPy

In [None]:
a = np.array([2., 4., 5.])
tf.constant(a)

In [None]:
t.numpy()

In [None]:
np.array(t)

In [None]:
tf.square(a)

In [None]:
np.square(t)

## 타입 변환

### 연산 예외

In [None]:
try:
  tf.constant(2.0) + tf.constant(40)
except tf.errors.InvalidArgumentError as ex:
  print(ex)

### 형 변환

In [None]:
try:
  t2 = tf.constant(40., dtype=tf.float64)
  print(tf.constant(2.0) + tf.cast(t2, tf.float32))
except tf.errors.InvalidArgumentError as ex:
  print(ex)

## 변수

In [None]:
v = tf.Variable([[1., 2., 3.], [4., 5., 6.]])
print(v)

In [None]:
v.assign(2 * v)

In [None]:
try:
  v[1] = [7., 8., 9.]
except TypeError as ex:
  print(ex)

In [None]:
v.scatter_nd_update(indices=[[0,0], [1,2]], updates=[100., 200.])

In [None]:
sparse_delta = tf.IndexedSlices(values=[[1., 2., 3.], [4., 5., 6.]],indices=[1, 0])
v.scatter_update(sparse_delta)

## 함수

In [None]:
@tf.function
def tf_cube(x):
  """Return ``x`` raised to the third power.

  The function is wrapped in ``tf.function``. The Python ``print`` call is
  kept deliberately: per the tf.function guide, plain Python side effects
  run only while the function is being traced, so the output shows when a
  new trace happens rather than every call.
  """
  # Trace-time marker; use tf.print instead if output is wanted on every call.
  print("print:", x)
  return x ** 3

result = tf_cube(tf.constant(2.0))

print(result)

result = tf_cube(2)
result = tf_cube(3)
result = tf_cube(tf.constant([[1., 2.]])) # New shape: trace!
result = tf_cube(tf.constant([[3., 4.], [5., 6.]])) # New shape: trace!
result = tf_cube(tf.constant([[7., 8.], [9., 10.], [11., 12.]])) # New shape: trace!

In [None]:
@tf.function
def tf_cube(x):
  """Add 1 to ``x`` once per visited element, then return the cube.

  Assumes ``x`` is a rank-2 tensor (the calls in this cell pass 2-D
  constants): the outer loop walks the rows, the inner loop walks the
  entries of each row, and every inner step increments the whole tensor.
  """
  # Plain Python print: per the tf.function guide it runs only during tracing.
  print("print:", x)
  for i in x:
    for j in i:
      # Rebinds x to a new tensor; the loop variables i and j are unused.
      x += 1
  return x ** 3

result = tf_cube(tf.constant([[3., 4.], [5., 6.]])) # New shape: trace!
result = tf_cube(tf.constant([[7., 8.], [9., 10.], [11., 12.]])) # New shape: trace!

### 난수 생성

In [None]:
rand = tf.random.uniform([1],0,1)
print(rand)

In [None]:
rand = tf.random.normal([4],0,1)
print(rand)

### 뉴런의 생성

In [None]:
# 시그모이드 함수
import math
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of a scalar.

    Args:
        x: Any value ``math.exp`` accepts after negation (a Python number or
            an object convertible to float).

    Returns:
        A float in ``[0, 1]``.

    Very negative inputs used to crash with ``OverflowError`` because
    ``math.exp(-x)`` exceeds the double range; the sigmoid has already
    saturated there, so 0.0 is returned instead.
    """
    try:
        return 1 / (1 + math.exp(-x))
    except OverflowError:
        # exp(-x) overflowed (x below roughly -709.78): the result underflows to 0.
        return 0.0

In [None]:
x = 1
y = 0
w = tf.random.normal([1],0,1)
output = sigmoid(x * w)
print(output)

In [None]:
# Train the single neuron with gradient descent.
for i in range(1000):
    output = sigmoid(x * w)
    # Error is target minus prediction.
    error = y - output
    # Move w in proportion to the input and the error; 0.1 acts as the step size.
    w = w + x * 0.1 * error

    # Report progress on every 100th iteration.
    if i % 100 == 99:
        print(i, error, output)

In [None]:
# Try to train a neuron that should output y=1 when x=0.
x = 0
y = 1
w = tf.random.normal([1],0,1)

for i in range(1000):
    # With x = 0 the neuron input is always 0, so the output stays sigmoid(0) = 0.5.
    output = sigmoid(x * w)
    error = y - output
    # The update term is multiplied by x = 0, so w never changes in this loop.
    w = w + x * 0.1 * error

    # Report progress on every 100th iteration.
    if i % 100 == 99:
        print(i, error, output)

In [None]:
# Same task (y=1 for x=0), now with a bias term added to the neuron.
x = 0
y = 1
w = tf.random.normal([1],0,1)
b = tf.random.normal([1],0,1)

for i in range(1000):
    # The bias is fed by a constant input of 1, so it contributes even when x = 0.
    output = sigmoid(x * w + 1 * b)
    error = y - output
    # w still cannot move (x = 0); only the bias is actually updated here.
    w = w + x * 0.1 * error
    b = b + 1 * 0.1 * error

    # Report progress on every 100th iteration.
    if i % 100 == 99:
        print(i, error, output)

### AND 연산을 위한 신경망

In [None]:
import numpy as np
# AND truth table: each row of x is an input pair, y holds the matching target.
x = np.array([[1,1], [1,0], [0,1], [0,0]])
y = np.array([[1], [0], [0], [0]])
# Randomly initialised weights (one per input) and bias.
w = tf.random.normal([2],0,1)
b = tf.random.normal([1],0,1)
# Constant input that feeds the bias term.
b_x = 1

for i in range(2000):
    # Signed errors of the four samples accumulated over this pass.
    error_sum = 0
    for j in range(4):
        output = sigmoid(np.sum(x[j]*w)+b_x*b)
        error = y[j][0] - output
        # Update after every sample: step size 0.1 times the error times the respective input.
        w = w + x[j] * 0.1 * error
        b = b + b_x * 0.1 * error
        error_sum += error

    # Print the accumulated error on every 200th pass.
    if i % 200 == 199:
        print(i, error_sum)

In [None]:
for i in range(4):
    print('X:', x[i], 'Y:', y[i], 'Output:', sigmoid(np.sum(x[i]*w)+b))


### OR 연산을 위한 신경망

In [None]:
import numpy as np
# OR truth table: each row of x is an input pair, y holds the matching target.
x = np.array([[1,1], [1,0], [0,1], [0,0]])
y = np.array([[1], [1], [1], [0]])
# Randomly initialised weights (one per input) and bias.
w = tf.random.normal([2],0,1)
b = tf.random.normal([1],0,1)
# Constant input that feeds the bias term.
b_x = 1

for i in range(2000):
    # Signed errors of the four samples accumulated over this pass.
    error_sum = 0
    for j in range(4):
        output = sigmoid(np.sum(x[j]*w)+b_x*b)
        error = y[j][0] - output
        # Update after every sample: step size 0.1 times the error times the respective input.
        w = w + x[j] * 0.1 * error
        b = b + b_x * 0.1 * error
        error_sum += error

    # Print the accumulated error on every 200th pass.
    if i % 200 == 199:
        print(i, error_sum)

In [None]:
for i in range(4):
    print('X:', x[i], 'Y:', y[i], 'Output:', sigmoid(np.sum(x[i]*w)+b))

### XOR를 위한 신경망

In [None]:
import numpy as np
# XOR truth table: each row of x is an input pair, y holds the matching target.
# NOTE(review): this is the same single-neuron setup as the AND/OR cells; a single
# neuron is not expected to fit XOR, which the later two-layer Keras model addresses.
x = np.array([[1,1], [1,0], [0,1], [0,0]])
y = np.array([[0], [1], [1], [0]])
# Randomly initialised weights (one per input) and bias.
w = tf.random.normal([2],0,1)
b = tf.random.normal([1],0,1)
# Constant input that feeds the bias term.
b_x = 1

for i in range(2000):
    # Signed errors of the four samples accumulated over this pass.
    error_sum = 0
    for j in range(4):
        output = sigmoid(np.sum(x[j]*w)+b_x*b)
        error = y[j][0] - output
        # Update after every sample: step size 0.1 times the error times the respective input.
        w = w + x[j] * 0.1 * error
        b = b + b_x * 0.1 * error
        error_sum += error

    # Print the accumulated error on every 200th pass.
    if i % 200 == 199:
        print(i, error_sum)

In [None]:
for i in range(4):
    print('X:', x[i], 'Y:', y[i], 'Output:', sigmoid(np.sum(x[i]*w)+b))

In [None]:
print('w:', w)
print('b:', b)

### XOR 문제 해결

In [None]:
import numpy as np
x = np.array([[1,1], [1,0], [0,1], [0,0]])
y = np.array([[0], [1], [1], [0]])

model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=2, activation='sigmoid', input_shape=(2,)),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1), loss='mse')

model.summary()

In [None]:
history = model.fit(x, y, epochs=10000, batch_size=1)

In [None]:
model.predict(x)

In [None]:
for weight in model.weights:
    print(weight)

In [None]:
#2-레이어 XOR 네트워크의 loss 변화를 선 그래프로 표시
import matplotlib.pyplot as plt
plt.plot(history.history['loss'])

## 데이터 로드 처리

### 전체 데이터셋을 메모리에 로드

In [None]:
X = tf.range(10)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

#이전과 동일한 역할
dataset = tf.data.Dataset.range(10)

In [None]:
#데이터 순회
for item in dataset:
  print(item)

### 반복과 배치 처리

In [None]:
dataset = dataset.repeat(3).batch(7)
for item in dataset:
  print(item)

### 데이터 변환

In [None]:
dataset = dataset.map(lambda x: x * 2)

for item in dataset:
  print(item)

### 데이터 셔플링

In [None]:
tf.random.set_seed(42)

dataset = tf.data.Dataset.range(10).repeat(3)
dataset = dataset.shuffle(buffer_size=3, seed=42).batch(7)
for item in dataset:
  print(item)

### 데이터 나누기

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_ 

In [None]:
#파일을 나누어서 저장하는 함수
def _csv_field(value):
  """Format one cell for the CSV output.

  NumPy numeric and boolean scalars are rendered with ``str`` because, from
  NumPy 2 on, their ``repr`` is wrapped (e.g. ``np.float64(1.5)``), which is
  not valid CSV. Any other value keeps the ``repr`` formatting.
  """
  if isinstance(value, (np.number, np.bool_)):
    return str(value)
  return repr(value)


def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
  """Split the rows of ``data`` across ``n_parts`` CSV files.

  Files are written to ``datasets/housing`` (created if missing) and named
  ``my_<name_prefix>_<NN>.csv``, where ``NN`` is the zero-padded part index.
  Rows are distributed with ``np.array_split``, so parts differ in size by
  at most one row.

  Args:
    data: Indexable collection of rows; each row is an iterable of cells.
    name_prefix: Label inserted into every file name.
    header: Optional header line (without newline) written first in each file.
    n_parts: Number of files to create.

  Returns:
    List of the file paths written, in part order.
  """
  housing_dir = os.path.join("datasets", "housing")
  os.makedirs(housing_dir, exist_ok=True)
  path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")
  filepaths = []
  m = len(data)
  for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
    part_csv = path_format.format(name_prefix, file_idx)
    filepaths.append(part_csv)
    with open(part_csv, "wt", encoding="utf-8") as f:
      if header is not None:
        f.write(header)
        f.write("\n")
      # Rows are written whether or not a header was requested; previously
      # this loop sat inside the `if`, leaving header-less files empty.
      for row_idx in row_indices:
        f.write(",".join(_csv_field(col) for col in data[row_idx]))
        f.write("\n")
  return filepaths

In [None]:
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

train_filepaths = save_to_multiple_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_multiple_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_multiple_csv_files(test_data, "test", header, n_parts=10)

In [None]:
import pandas as pd
#하나의 파일 읽기
pd.read_csv(train_filepaths[0]).head()

In [None]:
with open(train_filepaths[0]) as f:
  for i in range(5):
    print(f.readline(), end="")

In [None]:
# 파일 경로 확인
train_filepaths

In [None]:
#파일 경로 섞기
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [None]:
#파일 경로 확인
for filepath in filepath_dataset:
  print(filepath)

In [None]:
# Number of files that interleave() cycles through at once.
n_readers = 5
# Call interleave() to read from five files at a time, alternating between them line by line.
# skip(1) drops the first line of each file, which holds the column names.
dataset = filepath_dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1), cycle_length=n_readers)

# Show the first five lines that come out of the interleaved dataset.
for line in dataset.take(5):
  print(line.numpy())

### 데이터 전처리

In [None]:
record_defaults=[0, np.nan, tf.constant(np.nan, dtype=tf.float64), "Hello", tf.constant([])]
parsed_fields = tf.io.decode_csv('1,2,3,4,5', record_defaults)
parsed_fields

In [None]:
# The fifth field is required (its default is tf.constant([])), so omitting its value raises an exception.
try:
  parsed_fields = tf.io.decode_csv(',,,,', record_defaults)
except tf.errors.InvalidArgumentError as ex:
  print(ex)

In [None]:
# The number of fields must match the number of entries in record_defaults exactly.
try:
  parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults)
except tf.errors.InvalidArgumentError as ex:
  print(ex)

In [None]:
#전처리를 위한 함수
n_inputs = 8 # X_train.shape[-1]

@tf.function
def preprocess(line):
  """Parse one CSV line into a standardized feature vector and its target.

  The line must hold ``n_inputs`` feature columns followed by one target
  column. Missing features default to 0; the target has an empty default,
  which makes it a required field. Features are standardized with the
  module-level ``X_mean`` and ``X_std``.

  Returns:
    A ``(features, target)`` pair of tensors.
  """
  column_defaults = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
  columns = tf.io.decode_csv(line, record_defaults=column_defaults)
  features = tf.stack(columns[:-1])
  target = tf.stack(columns[-1:])
  standardized = (features - X_mean) / X_std
  return standardized, target
  
preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')

In [None]:
#csv 파일에서 캘리포니아 주택 데이터셋을 효율적으로 적재하고 전처리, 셔플링, 반복, 배치를 적용한 데이터셋을 만들어 반환하는 함수
def csv_reader_dataset(filepaths, repeat=1, n_readers=5, n_read_threads=None, shuffle_buffer_size=10000, n_parse_threads=5, batch_size=32):
  """Build a tf.data pipeline over CSV files: read, shuffle, parse, batch.

  Args:
    filepaths: File paths (or patterns) handed to ``Dataset.list_files``.
    repeat: Passed to ``Dataset.repeat`` on the list of files.
    n_readers: ``cycle_length`` for ``interleave`` (files read at once).
    n_read_threads: ``num_parallel_calls`` for ``interleave``.
    shuffle_buffer_size: Buffer size for ``shuffle``.
    n_parse_threads: ``num_parallel_calls`` for the ``preprocess`` map.
    batch_size: Number of examples per batch.

  Returns:
    A dataset of ``preprocess`` outputs, batched, with ``prefetch(1)``.
  """
  def read_data_lines(filepath):
    # Skip the first line of every file.
    return tf.data.TextLineDataset(filepath).skip(1)

  file_ds = tf.data.Dataset.list_files(filepaths).repeat(repeat)
  line_ds = file_ds.interleave(read_data_lines, cycle_length=n_readers, num_parallel_calls=n_read_threads)
  shuffled_ds = line_ds.shuffle(shuffle_buffer_size)
  example_ds = shuffled_ds.map(preprocess, num_parallel_calls=n_parse_threads)
  return example_ds.batch(batch_size).prefetch(1)


In [None]:
# Try out csv_reader_dataset(): build a training dataset with tiny batches and print the first two.
tf.random.set_seed(42)

train_set = csv_reader_dataset(train_filepaths, batch_size=3)
for X_batch, y_batch in train_set.take(2):
  print("X =", X_batch)
  print("y =", y_batch)
  print()

### tf.keras와 데이터셋 사용

In [None]:
# Create the datasets used for training, validation and testing.
# repeat=None is forwarded to Dataset.repeat(); per the tf.data docs this repeats indefinitely.
train_set = csv_reader_dataset(train_filepaths, repeat=None)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

# Reset Keras state and fix the random seeds before building the model.
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

# One hidden ReLU layer with 30 units followed by a single-unit output layer.
model = keras.models.Sequential([keras.layers.Dense(30, activation="relu", input_shape=X_train.shape[1:]), keras.layers.Dense(1)])

In [None]:
#csv_reader_dataset() 함수로 훈련 세트로 사용할 데이터셋을 만들 수 있음
model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3))

batch_size = 32
model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10, validation_data=valid_set)

model.evaluate(test_set, steps=len(X_test) // batch_size)

In [None]:
new_set = test_set.map(lambda X, y: X)
X_new = X_test
model.predict(new_set, steps=len(X_new) // batch_size)

In [None]:
# Manual training loop: iterate over batches and apply gradients by hand.
optimizer = keras.optimizers.Nadam(learning_rate=0.01)
loss_fn = keras.losses.mean_squared_error

n_epochs = 5
batch_size = 32
n_steps_per_epoch = len(X_train) // batch_size
total_steps = n_epochs * n_steps_per_epoch
global_step = 0
# take() bounds the number of batches drawn from train_set to total_steps.
for X_batch, y_batch in train_set.take(total_steps):
  global_step += 1
  # "\r" with end="" rewrites the same console line on every step.
  print("\rGlobal step {}/{}".format(global_step, total_steps), end="")
  # Record the forward pass so gradients of the loss can be computed afterwards.
  with tf.GradientTape() as tape:
    y_pred = model(X_batch)
    main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
    # Add any extra losses the model exposes through model.losses.
    loss = tf.add_n([main_loss] + model.losses)

  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [None]:
#훈련을 반복하는 함수
optimizer = keras.optimizers.Nadam(learning_rate=0.01)
loss_fn = keras.losses.mean_squared_error

@tf.function
def train(model, n_epochs, batch_size=32,n_readers=5, n_read_threads=5, shuffle_buffer_size=10000, n_parse_threads=5):
  """Run a manual training loop over the CSV training files.

  Uses the module-level ``train_filepaths``, ``X_train``, ``optimizer`` and
  ``loss_fn``. The dataset is built with ``csv_reader_dataset`` and repeated
  ``n_epochs`` times; returns nothing.

  Args:
    model: Callable model exposing ``losses`` and ``trainable_variables``.
    n_epochs: Number of passes over the training files.
    batch_size: Examples per batch; also used to derive the step count.
    n_readers: Forwarded to ``csv_reader_dataset``.
    n_read_threads: Forwarded to ``csv_reader_dataset``.
    shuffle_buffer_size: Forwarded to ``csv_reader_dataset``.
    n_parse_threads: Forwarded to ``csv_reader_dataset``.
  """
  train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, n_readers=n_readers, n_read_threads=n_read_threads, shuffle_buffer_size=shuffle_buffer_size, n_parse_threads=n_parse_threads, batch_size=batch_size)
  n_steps_per_epoch = len(X_train) // batch_size
  total_steps = n_epochs * n_steps_per_epoch
  global_step = 0

  # take() bounds the number of batches drawn to total_steps.
  for X_batch, y_batch in train_set.take(total_steps):
    global_step += 1
    # Log every 100th step; tf.print is used rather than print inside the tf.function.
    if tf.equal(global_step % 100, 0):
      tf.print("\rGlobal step", global_step, "/", total_steps)

    # Record the forward pass so gradients of the loss can be computed afterwards.
    with tf.GradientTape() as tape:
      y_pred = model(X_batch)
      main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
      # Add any extra losses the model exposes through model.losses.
      loss = tf.add_n([main_loss] + model.losses)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

In [None]:
train(model, 5)

### TFRecord

In [None]:
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
  f.write(b"This is the first record")
  f.write(b"And this is the second record")

filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
  print(item)

In [None]:
#기본적으로 한 번에 하나의 파일만 읽지만 num_parallel_reads=3와 같이 지정하면 동시에 3개를 읽고 레코드를 번갈아 반환
filepaths = ["my_test_{}.tfrecord".format(i) for i in range(5)]
for i, filepath in enumerate(filepaths):
  with tf.io.TFRecordWriter(filepath) as f:
    for j in range(3):
      f.write("File {} record {}".format(i, j).encode("utf-8"))

dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=3)
for item in dataset:
  print(item)

### 압축된 TFRecord

In [None]:
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("my_compressed.tfrecord", options) as f:
  f.write(b"This is the first record")
  f.write(b"And this is the second record")

dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"], compression_type="GZIP")
for item in dataset:
  print(item)

### TensorFlow 데이터셋 프로젝트

In [None]:
!pip install tensorflow_datasets

import tensorflow_datasets as tfds

datasets = tfds.load(name="mnist")
mnist_train, mnist_test = datasets["train"], datasets["test"]


In [None]:
plt.figure(figsize=(6,3))
mnist_train = mnist_train.repeat(5).batch(32).prefetch(1)
for item in mnist_train:
  images = item["image"]
  labels = item["label"]
  for index in range(5):
    plt.subplot(1, 5, index + 1)
    image = images[index, ..., 0]
    label = labels[index].numpy()
    plt.imshow(image, cmap="binary")
    plt.title(label)
    plt.axis("off")
  break # just showing part of the first batch


In [None]:
datasets = tfds.load(name="mnist")
mnist_train, mnist_test = datasets["train"], datasets["test"]
mnist_train = mnist_train.repeat(5).batch(32)
mnist_train = mnist_train.map(lambda items: (items["image"], items["label"]))
mnist_train = mnist_train.prefetch(1)
for images, labels in mnist_train.take(1):
  print(images.shape)
  print(labels.numpy())


In [None]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

datasets = tfds.load(name="mnist", batch_size=32, as_supervised=True)
mnist_train = datasets["train"].repeat().prefetch(1)
model = keras.models.Sequential([keras.layers.Flatten(input_shape=[28, 28, 1]), keras.layers.Lambda(lambda images: tf.cast(images, tf.float32)), keras.layers.Dense(10, activation="softmax")])

model.compile(loss="sparse_categorical_crossentropy", optimizer=keras.optimizers.SGD(learning_rate=1e-3), metrics=["accuracy"])
model.fit(mnist_train, steps_per_epoch=60000 // 32, epochs=5) 