import re
import string
# Substitutions applied, in this order, by clean_text().  Every regex with a
# backslash is a raw string so that ``\s``, ``\w``, ``\d`` ... reach the regex
# engine untouched instead of being parsed as (invalid) string escapes.
_CLEANUP_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'@[^\s]+',                               # usernames (@mentions)
        r'\[.*?\]',                               # text inside square brackets
        r'((www\.[^\s]+)|(https?://[^\s]+))',     # URLs
        '[%s]' % re.escape(string.punctuation),   # ASCII punctuation
        r'\w*\d\w*',                              # any word containing a digit
        '[‘’“”…]',                                # curly quotes and ellipsis
        r'\n',                                    # newlines
    )
)


def clean_text(tweet):
    """Return a lower-cased copy of *tweet* with noise stripped out.

    After lower-casing, the following are deleted (replaced by the empty
    string), in order: @mentions, bracketed text, URLs, ASCII punctuation,
    words containing a digit, curly quotes / ellipsis, and newlines.

    Mentions and URLs are removed before punctuation because their patterns
    rely on ``@``, ``.``, ``:`` and ``/`` still being present.

    Matches are deleted rather than replaced by a space, so surrounding
    whitespace is left as-is (runs of spaces may remain) and words separated
    only by a newline are joined together.

    Args:
        tweet: The raw tweet text (a ``str``).

    Returns:
        The cleaned text.
    """
    tweet = tweet.lower()
    for pattern in _CLEANUP_PATTERNS:
        tweet = pattern.sub('', tweet)
    return tweet
def tweet(x):
    """Return ``clean_text(x)``; a named one-argument wrapper around it."""
    return clean_text(x)
# Stopwords
from nltk.corpus import stopwords
# Retweet markers are filtered out together with NLTK's English stop words.
additional = ['rt', 'rts', 'retweet']
swords = set(stopwords.words('english')).union(additional)

# Rebuild each tweet from its whitespace-separated words, keeping only the
# words that are not in the stop-word set.
filtered_tweets = data_clean['tweet'].apply(
    lambda text: ' '.join(w for w in text.split() if w not in swords)
)
data_clean1 = pd.DataFrame(filtered_tweets)
data_clean1
# Tokenization
# Tokenize every filtered tweet with NLTK; each cell of the resulting
# 'tweet' column holds whatever nltk.word_tokenize returns for that tweet.
tokenized_tweets = data_clean1['tweet'].apply(nltk.word_tokenize)
tokens = pd.DataFrame(tokenized_tweets)
tokens.head()
# Question: how can I apply lemmatization to the tweets stored in the `tokens` DataFrame?