# Download the NLTK resources needed by the examples in this notebook.
import nltk # NLTK is the usual choice when preprocessing English text
nltk.download('punkt')
nltk.download('webtext')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
# Last expression of the cell: its return value is what the notebook displays.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samsung\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\samsung\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samsung\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samsung\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\samsung\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\samsung\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!

True


# Sample English paragraph reused by the tokenization examples.
para = "Hello everyone. It's good to see you. Let's start our text mining class!"


# Sentence tokenization: NLTK's sent_tokenize splits a text into sentences,
# mostly keying on terminators such as '.', '!' and '?'.
from nltk.tokenize import sent_tokenize

english_sentences = sent_tokenize(para)
print(english_sentences)

['Hello everyone.', "It's good to see you.", "Let's start our text mining class!"]


import nltk.data

paragraph_french = """Je t'ai demandé si tu m'aimais bien, Tu m'a répondu non. 
Je t'ai demandé si j'étais jolie, Tu m'a répondu non. 
Je t'ai demandé si j'étai dans ton coeur, Tu m'a répondu non."""

# Select the Punkt sentence model that was pre-trained on French text.
tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
french_sentences = tokenizer.tokenize(paragraph_french)
print(french_sentences)

["Je t'ai demandé si tu m'aimais bien, Tu m'a répondu non.", "Je t'ai demandé si j'étais jolie, Tu m'a répondu non.", "Je t'ai demandé si j'étai dans ton coeur, Tu m'a répondu non."]


# Korean counterpart of the sample paragraph.
para_kor = "안녕하세요, 여러분. 만나서 반갑습니다. 이제 텍스트마이닝 클래스를 시작해봅시다!"


# The sentence tokenizer also works well on this Korean text, although NLTK
# does not yet provide a model pre-trained on Korean.
korean_sentences = sent_tokenize(para_kor)
print(korean_sentences)

['안녕하세요, 여러분.', '만나서 반갑습니다.', '이제 텍스트마이닝 클래스를 시작해봅시다!']


# Word tokenization: word_tokenize splits the given text into word tokens.
from nltk.tokenize import word_tokenize

english_words = word_tokenize(para)
print(english_words)

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']


from nltk.tokenize import WordPunctTokenizer

# Tokenize the same paragraph with WordPunctTokenizer for comparison with
# word_tokenize (see how the apostrophes are split in the printed result).
punct_tokenizer = WordPunctTokenizer()
print(punct_tokenizer.tokenize(para))

['Hello', 'everyone', '.', 'It', "'", 's', 'good', 'to', 'see', 'you', '.', 'Let', "'", 's', 'start', 'our', 'text', 'mining', 'class', '!']


# Apply the same word tokenizer to the Korean paragraph.
korean_words = word_tokenize(para_kor)
print(korean_words)

['안녕하세요', ',', '여러분', '.', '만나서', '반갑습니다', '.', '이제', '텍스트마이닝', '클래스를', '시작해봅시다', '!']


import re  # standard-library regular-expression support

# [] is a character class: it matches any single character listed between
# the brackets, and findall returns every such match found in the text.
re.compile("[abc]").findall("How are you, boy?")

['a', 'b']


# Find every digit; the class [0123456789] can also be written as [0-9].
re.compile("[0123456789]").findall("3a7b5c9d")

['3', '7', '5', '9']


# To match letters, digits and '_' you could write [a-zA-Z0-9_], but the
# shorthand \w is simpler. (For str patterns \w also matches non-ASCII word
# characters unless re.ASCII is given.)
# The pattern is a raw string so that the backslash reaches the regex engine
# unchanged; "\w" in a normal string literal is an invalid escape sequence.
re.findall(r"[\w]", "3a 7b_ '.^&5c9d")

['3', 'a', '7', 'b', '_', '5', 'c', '9', 'd']


# The + quantifier means "one or more repetitions" of the preceding item.
# [_]+ therefore extracts every run of one or more consecutive underscores.
re.compile("[_]+").findall("a_b, c__d, e___f")

['_', '__', '___']


# \w does not match punctuation or other special characters, so combining it
# with + gives a result similar to word tokenization.
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
re.findall(r"[\w]+", "How are you, boy?")

['How', 'are', 'you', 'boy']


# {2,4} matches between two and four repetitions. "boooooooy" contains seven
# o's, so it is reported as two separate matches: four o's, then three.
re.compile("[o]{2,4}").findall("oh, hoow are yoooou, boooooooy?")

['oo', 'oooo', 'oooo', 'ooo']


# NLTK provides a regex-driven tokenizer as RegexpTokenizer: the pattern
# passed to RegexpTokenizer() defines what is extracted as a token.
from nltk.tokenize import RegexpTokenizer

# [\w']+ : one or more word characters (letters, digits, '_') or apostrophes.
# Written as a raw string because "\w" in a normal string literal is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize("Sorry, I can't go there."))
# "can't" is recognized as a single word.

['Sorry', 'I', "can't", 'go', 'there']


# Unlike the pattern [\w']+, this one has no apostrophe in the character
# class, so "can't" is split into "can" and "t".
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r"[\w]+")
print(tokenizer.tokenize("Sorry, I can't go there."))

['Sorry', 'I', 'can', 't', 'go', 'there']


text1 = "Sorry, I can't go there."
# [\w']{3,} : keep only runs of at least three word characters/apostrophes,
# so two-letter words such as "go" are dropped from the result.
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r"[\w']{3,}")
print(tokenizer.tokenize(text1.lower()))

['sorry', "can't", 'there']


from nltk.corpus import stopwords  # words that are normally not analysed

# Converted to a set so that duplicates are removed; a set also makes the
# membership test in the filter below cheap.
english_stops = set(stopwords.words('english'))

text1 = "Sorry, I couldn't go to movie yesterday."

# Tokenize with RegexpTokenizer (not word_tokenize) so that contractions such
# as "couldn't" stay in one piece; lower() converts the text to lowercase.
# Raw string: "\w" in a normal string literal is an invalid escape sequence.
tokenizer = RegexpTokenizer(r"[\w']+")
tokens = tokenizer.tokenize(text1.lower())

# Keep only the tokens that are not in english_stops.
result = [word for word in tokens if word not in english_stops]
print(result)

['sorry', 'go', 'movie', 'yesterday']


# english_stops is a set, so the order of the printed words is arbitrary.
print(english_stops) # Inspect the English stopwords provided by NLTK

{'below', 'down', 'while', 'on', 'so', 'haven', "mustn't", 'ourselves', 'hers', 'if', 'how', 'too', 'its', 've', 'a', 'being', 's', 'in', 'here', 'don', 'your', "didn't", 'over', "won't", 'y', 'theirs', 're', 'myself', 'no', 'shan', 'above', 'weren', 'they', 'what', "shan't", 'him', 'or', 'by', 'did', "couldn't", 'having', "doesn't", 'had', 'at', 'there', 'is', 'into', 'it', "aren't", 'himself', 'was', 'mustn', 'off', 'have', 'isn', 'can', 'as', 'against', 'm', 'some', 'through', 'this', 'wasn', 'under', 'just', 'again', 'o', 'wouldn', 'very', 'his', 'who', 'until', "wouldn't", 'more', 'couldn', 'now', 'doesn', "she's", 'only', "you're", 'been', 'to', 'not', 'be', 'are', 'were', 'when', 'same', 'their', 'that', 'which', 'then', 'with', "don't", 'aren', 'all', 'during', 'each', 'ours', 'where', 't', 'because', 'yours', 'my', "it's", 'between', 'itself', 'those', "mightn't", 'the', 'after', 'for', 'few', "hadn't", "wasn't", 'ma', "shouldn't", 'yourselves', 'she', 'didn', 'i', 'than', "that'll", 'am', "haven't", 'you', 'of', 'and', 'doing', 'herself', 'has', 'he', 'mightn', 'needn', 'them', 'won', 'hadn', 'an', 'does', 'do', 'why', "you've", 'about', "weren't", "you'd", 'ain', 'most', 'any', 'nor', 'up', 'hasn', 'from', 'out', 'own', 'our', "you'll", 'whom', 'should', 'we', 'themselves', 'other', 'such', "hasn't", 'shouldn', 'yourself', 'will', 'before', "isn't", "needn't", 'both', 'd', "should've", 'me', 'once', 'these', 'further', 'but', 'her', 'll'}


# Define and apply a custom stopword list.
# The same approach is also useful when processing Korean text.
my_stopword = ['i', 'go', 'to']  # user-defined stopwords as a plain list
result = [
    token
    for token in tokens
    if token not in my_stopword
]
print(result)

['sorry', "couldn't", 'movie', 'yesterday']


from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Stemming examples:
#   'cooking'   - an inflected form of "cook", so the stem "cook" is extracted
#   'cookery'   - see the printed result
#   'cookbooks' - plural noun reduced to its singular form (in English this is
#                 also treated as part of stemming)
print(*map(stemmer.stem, ('cooking', 'cookery', 'cookbooks')))

cook cookeri cookbook


# Stemming combined with tokenization.
from nltk.tokenize import word_tokenize

para = "Hello everyone. It's good to see you. Let's start our text mining class!"

# Tokenize first ...
tokens = word_tokenize(para)
print(tokens)

# ... then run the stemmer over every token.
result = list(map(stemmer.stem, tokens))
print(result)

['Hello', 'everyone', '.', 'It', "'s", 'good', 'to', 'see', 'you', '.', 'Let', "'s", 'start', 'our', 'text', 'mining', 'class', '!']
['hello', 'everyon', '.', 'it', "'s", 'good', 'to', 'see', 'you', '.', 'let', "'s", 'start', 'our', 'text', 'mine', 'class', '!']


from nltk.stem import LancasterStemmer

# Rebinds `stemmer` to a Lancaster stemmer and stems the same three words that
# were used for the Porter stemmer.
stemmer = LancasterStemmer()
print(*map(stemmer.stem, ('cooking', 'cookery', 'cookbooks')))

cook cookery cookbook


# Lemmatization with WordNetLemmatizer.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(
    # 'cooking' is defined as a noun in the dictionary, so it is already
    # treated as a base form.
    lemmatizer.lemmatize('cooking'),
    # With the part of speech given as a verb the lemma changes.
    lemmatizer.lemmatize('cooking', pos='v'),
    # 'cookery' is likewise its own base form.
    lemmatizer.lemmatize('cookery'),
    # 'cookbooks' is listed under 'cookbook', which is what is returned.
    lemmatizer.lemmatize('cookbooks'),
    sep='\n',
)

cooking
cook
cookery
cookbook


# Comparison of lemmatizing and stemming on the word 'believes'.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Stemming keeps only "believ" (the stem shared with believing, believed).
stem_of_believes = stemmer.stem('believes')
# Without a POS hint the word is treated as a plural noun (belief -> believes).
noun_lemma = lemmatizer.lemmatize('believes')
# With pos='v' it is treated as the third-person form of the verb "believe".
verb_lemma = lemmatizer.lemmatize('believes', pos='v')

print('stemming result:', stem_of_believes)
print('lemmatizing result:', noun_lemma)
print('lemmatizing result:', verb_lemma)

stemming result: believ
lemmatizing result: belief
lemmatizing result: believe


import nltk
from nltk.tokenize import word_tokenize

# Tokenize the sample paragraph, then attach a part-of-speech tag to each token.
tokens = word_tokenize("Hello everyone. It's good to see you. Let's start our text mining class!")
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('Hello', 'NNP'), ('everyone', 'NN'), ('.', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('good', 'JJ'), ('to', 'TO'), ('see', 'VB'), ('you', 'PRP'), ('.', '.'), ('Let', 'VB'), ("'s", 'POS'), ('start', 'VB'), ('our', 'PRP$'), ('text', 'NN'), ('mining', 'NN'), ('class', 'NN'), ('!', '.')]


# nltk.help.upenn_tagset('a') shows the description of the POS tag abbreviation 'a'.
# NLTK uses the Penn Treebank tag set.
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


# Extract only the words whose tag is exactly one of the wanted POS tags.
my_tag_set = ['NN', 'VB', 'JJ']
my_words = [
    token
    for token, pos in nltk.pos_tag(tokens)
    if pos in my_tag_set
]
print(my_words)

['everyone', 'good', 'see', 'Let', 'start', 'text', 'mining', 'class']


# Unfiltered list of the (word, tag) pairs, for comparison with the filtered
# result.
words_nothings = list(nltk.pos_tag(tokens))
print(words_nothings)

[('Hello', 'NNP'), ('everyone', 'NN'), ('.', '.'), ('It', 'PRP'), ("'s", 'VBZ'), ('good', 'JJ'), ('to', 'TO'), ('see', 'VB'), ('you', 'PRP'), ('.', '.'), ('Let', 'VB'), ("'s", 'POS'), ('start', 'VB'), ('our', 'PRP$'), ('text', 'NN'), ('mining', 'NN'), ('class', 'NN'), ('!', '.')]


# Render each (word, tag) pair as a single "word/tag" string.
words_with_tag = [
    token + '/' + pos
    for token, pos in nltk.pos_tag(tokens)
]
print(words_with_tag)

['Hello/NNP', 'everyone/NN', './.', 'It/PRP', "'s/VBZ", 'good/JJ', 'to/TO', 'see/VB', 'you/PRP', './.', 'Let/VB', "'s/POS", 'start/VB', 'our/PRP$', 'text/NN', 'mining/NN', 'class/NN', '!/.']


# Korean sample text used for the morphological-analysis examples.
sentence = '''절망의 반대가 희망은 아니다.
어두운 밤하늘에 별이 빛나듯
희망은 절망 속에 싹트는 거지
만약에 우리가 희망함이 적다면
그 누가 세상을 비출어줄까.
정희성, 희망 공부'''


# Applied to Korean, word_tokenize splits the text on whitespace.
tokens = word_tokenize(sentence)
print(tokens)
print('\n')

# Run NLTK's POS tagger on the Korean tokens.
korean_tagged = nltk.pos_tag(tokens)
print(korean_tagged)

['절망의', '반대가', '희망은', '아니다', '.', '어두운', '밤하늘에', '별이', '빛나듯', '희망은', '절망', '속에', '싹트는', '거지', '만약에', '우리가', '희망함이', '적다면', '그', '누가', '세상을', '비출어줄까', '.', '정희성', ',', '희망', '공부']


[('절망의', 'JJ'), ('반대가', 'NNP'), ('희망은', 'NNP'), ('아니다', 'NNP'), ('.', '.'), ('어두운', 'VB'), ('밤하늘에', 'JJ'), ('별이', 'NNP'), ('빛나듯', 'NNP'), ('희망은', 'NNP'), ('절망', 'NNP'), ('속에', 'NNP'), ('싹트는', 'NNP'), ('거지', 'NNP'), ('만약에', 'NNP'), ('우리가', 'NNP'), ('희망함이', 'NNP'), ('적다면', 'NNP'), ('그', 'NNP'), ('누가', 'NNP'), ('세상을', 'NNP'), ('비출어줄까', 'NNP'), ('.', '.'), ('정희성', 'NN'), (',', ','), ('희망', 'NNP'), ('공부', 'NNP')]


from konlpy.tag import Okt

# Okt analyzer (the original note describes it as the Twitter class).
t = Okt()


# Morphemes of the sample sentence.
morphemes = t.morphs(sentence)
print('형태소:', morphemes)
print()

# Nouns only.
nouns_only = t.nouns(sentence)
print('명사:', nouns_only)
print()

# (morpheme, POS tag) pairs.
tagged_morphemes = t.pos(sentence)
print('품사 태깅 결과:', tagged_morphemes)

형태소: ['절망', '의', '반대', '가', '희망', '은', '아니다', '.', '\n', '어', '두운', '밤하늘', '에', '별', '이', '빛나듯', '\n', '희망', '은', '절망', '속', '에', '싹트는', '거지', '\n', '만약', '에', '우리', '가', '희망', '함', '이', '적다면', '\n', '그', '누가', '세상', '을', '비출어줄까', '.', '\n', '정희성', ',', '희망', '공부']

명사: ['절망', '반대', '희망', '어', '두운', '밤하늘', '별', '희망', '절망', '속', '거지', '만약', '우리', '희망', '함', '그', '누가', '세상', '정희성', '희망', '공부']

품사 태깅 결과: [('절망', 'Noun'), ('의', 'Josa'), ('반대', 'Noun'), ('가', 'Josa'), ('희망', 'Noun'), ('은', 'Josa'), ('아니다', 'Adjective'), ('.', 'Punctuation'), ('\n', 'Foreign'), ('어', 'Noun'), ('두운', 'Noun'), ('밤하늘', 'Noun'), ('에', 'Josa'), ('별', 'Noun'), ('이', 'Josa'), ('빛나듯', 'Verb'), ('\n', 'Foreign'), ('희망', 'Noun'), ('은', 'Josa'), ('절망', 'Noun'), ('속', 'Noun'), ('에', 'Josa'), ('싹트는', 'Verb'), ('거지', 'Noun'), ('\n', 'Foreign'), ('만약', 'Noun'), ('에', 'Josa'), ('우리', 'Noun'), ('가', 'Josa'), ('희망', 'Noun'), ('함', 'Noun'), ('이', 'Josa'), ('적다면', 'Verb'), ('\n', 'Foreign'), ('그', 'Noun'), ('누가', 'Noun'), ('세상', 'Noun'), ('을', 'Josa'), ('비출어줄까', 'Verb'), ('.', 'Punctuation'), ('\n', 'Foreign'), ('정희성', 'Noun'), (',', 'Punctuation'), ('희망', 'Noun'), ('공부', 'Noun')]

Chapter 2. 텍스트 전처리¶

텍스트 전처리: 텍스트에서 불필요한 특수문자(문장부호) 등을 제거하고 각 단어의 "품사"까지 파악하는 것¶

전처리 단계¶

1. 토큰화(Tokenization)¶

NLTK (https://www.nltk.org/) 설치¶

1.1 문장 토큰화(sentence tokenize)¶

1.2 단어 토큰화 (word tokenize)¶

1.3 정규표현식을 이용한 토큰화¶

1.4 노이즈와 불용어 제거¶

2. 정규화(Normalization)¶

같은 의미로 쓰인 동일한 단어임에도 불구하고 다른 형태로 쓰인 단어들을 통일해 표준 단어로 만드는 작업¶

즉, 다양한 변형을 원형의 형태로 통일하는 작업 = 정규화¶

* 방법에 따라 어간 추출과 표제어 추출로 나뉨¶

2.1 어간 추출(Stemming)¶

포터스테머(PorterStemmer)¶

랭카스터 스테머(LancasterStemmer)¶

2.2 표제어 추출(Lemmatization)¶

이때 정확한 기본형을 알기 위해서는 품사를 알아야 하는데, 품사는 문장의 문맥을 파악해야만 알 수 있다¶

3. 품사 태깅(Part-of-Speech Tagging)¶

3.1 품사의 이해¶

3.2 NLTK를 이용한 품사 태깅¶

3.3 한글 형태소 분석과 품사 태깅¶

KoNLPy 설치¶