pbar.py
import pyprind
import pandas as pd
import os
pbar = pyprind.ProgBar(50000)
labels = {'pos':1, 'neg':0}

df = pd.DataFrame()
'''
# Read the 50,000 raw reviews from the aclImdb folders, label them,
# shuffle them, and save the result to movie_data.csv (run once).
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = './aclImdb/%s/%s' % (s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()

df.columns = ['review', 'sentiment']

import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False)
'''
df = pd.read_csv('./movie_data.csv')
print('df.head(3):\n', df.head(3))
# pbar runs inside the data-collection loop above

# Transforming words into feature vectors
import pandas as pd
import numpy as np
df = pd.DataFrame()
df = pd.read_csv('./movie_data.csv')
print('df.head(3):\n', df.head(3))

from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])
bag = count.fit_transform(docs)

print(count.vocabulary_)
print(bag.toarray())

'''
tf-idf(t, d) = tf(t, d) x idf(t, d), with idf(t, d) = log(n_d / (1 + df(d, t))),
where n_d is the total number of documents and df(d, t) is the number of documents that contain the term t.
'''
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())
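# A minimal sketch (not in the original post) verifying scikit-learn's tf-idf for the term 'is'
# in the third document. With the default smooth_idf=True, TfidfTransformer actually uses
# idf(t) = ln((1 + n_d) / (1 + df(d, t))) + 1 and then l2-normalizes each row.
n_d = 3       # total number of documents
df_is = 3     # 'is' occurs in all three documents
tf_is = 2     # 'is' occurs twice in the third document
raw_tfidf_is = tf_is * (np.log((1 + n_d) / (1 + df_is)) + 1)
print('raw tf-idf of "is" in doc 3:', raw_tfidf_is)   # 2.0 before l2 normalization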

print(df.loc[0, 'review'][-50:])

import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)                               # strip HTML markup
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)    # collect emoticons
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

print(preprocessor(df.loc[0, 'review'][-50:]))
print(preprocessor("</a>This :) is :( a test :-)!"))

# Apply the preprocessor to all movie reviews
df['review'] = df['review'].apply(preprocessor)

# Processing documents into tokens
def tokenizer(text):
    return text.split()
print('tokenizer:\n', tokenizer('running like running and thus they run'))

from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

print('tokenizer_porter("runners like running and thus they run"):\n'
, tokenizer_porter('runners like running and thus they run'))

'''
Stop-words: words that are so common in all kinds of text that they carry little (or only very little)
information useful for distinguishing between different classes of documents.
Examples of stop-words are is, and, has.
'''
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('english')
print([w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop])

# Training a logistic regression model for document classification
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

# Use a GridSearchCV object to find the optimal parameter set for this logistic regression
# model via 5-fold stratified cross-validation.
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in scikit-learn 0.20
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf': [False],
               'vect__norm': [None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}]
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(solver='liblinear', random_state=0))])  # liblinear supports both the l1 and l2 penalties in the grid
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train)
print('Best parameter set : %s ' % gs_lr_tfidf.best_params_)

print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))
# Working with bigger data: online algorithms and out-of-core learning
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

# Define a generator function stream_docs that reads in and returns one document at a time.
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip the header line
        for line in csv:
            text, label = line[:-3], int(line[-2])  # review text, then the sentiment label in the last column
            yield text, label

# As a check, read the first document from movie_data.csv
print(next(stream_docs(path='./machinelearning/movie_data.csv')))

# Define a get_minibatch function that takes the document stream from stream_docs
# and returns the number of documents specified by the size parameter.
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
# HashingVectorizer is stateless (no vocabulary has to be kept in memory), which makes it suitable for out-of-core learning.
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)
# Older scikit-learn versions used loss='log' and n_iter=1 here; the iteration count is ignored by partial_fit anyway.
clf = SGDClassifier(loss='log_loss', random_state=1)
doc_stream = stream_docs(path='./machinelearning/movie_data.csv')

import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
# 45 mini-batches of 1,000 documents each -> 45,000 training documents
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

# Evaluate on the remaining 5,000 documents
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

# Finally, use the last 5,000 documents to update the model as well
clf = clf.partial_fit(X_test, y_test)

