import os import re import gc import string import unicodedata import operator import numpy as np import pandas as pd
from tqdm import tqdm tqdm.pandas()
""" utils """
def load_data(datapath):
    """
    read the train / test csv files stored under `datapath`

    Parameters
    ----------
    datapath: str, directory that contains "train.csv" and "test.csv"

    Returns
    -------
    (df_train, df_test): tuple of two pandas DataFrames
    """
    print("loading data ......")
    train_file = os.path.join(datapath, "train.csv")
    test_file = os.path.join(datapath, "test.csv")
    df_train = pd.read_csv(train_file)
    df_test = pd.read_csv(test_file)
    # report the shapes only after both files were read successfully
    for label, frame in (("train", df_train), ("test", df_test)):
        print(label + " data with shape : ", frame.shape)
    return df_train, df_test
""" nlp """
# misspell list (quora vs. glove): token found in the questions -> replacement
# text that the embedding vocabulary knows.
# Keys are matched as plain substrings (no word boundaries); leading/trailing
# spaces inside keys and values are significant.
_MISSPELL_TO_SUB = {
    'Terroristan': 'terrorist Pakistan',
    'terroristan': 'terrorist Pakistan',
    'BIMARU': 'Bihar, Madhya Pradesh, Rajasthan, Uttar Pradesh',
    'Hinduphobic': 'Hindu phobic',
    'hinduphobic': 'Hindu phobic',
    'Hinduphobia': 'Hindu phobic',
    'hinduphobia': 'Hindu phobic',
    'Babchenko': 'Arkady Arkadyevich Babchenko faked death',
    'Boshniaks': 'Bosniaks',
    'Dravidanadu': 'Dravida Nadu',
    'mysoginists': 'misogynists',
    'MGTOWS': 'Men Going Their Own Way',
    'mongloid': 'Mongoloid',
    'unsincere': 'insincere',
    'meninism': 'male feminism',
    'jewplicate': 'jewish replicate',
    'unoin': 'Union',
    'daesh': 'Islamic State of Iraq and the Levant',
    'Kalergi': 'Coudenhove-Kalergi',
    'Bhakts': 'Bhakt',
    'bhakts': 'Bhakt',
    'Tambrahms': 'Tamil Brahmin',
    'Pahul': 'Amrit Sanskar',
    'SJW': 'social justice warrior',
    'SJWs': 'social justice warrior',
    ' incel': ' involuntary celibates',
    ' incels': ' involuntary celibates',
    'emiratis': 'Emiratis',
    'weatern': 'western',
    'westernise': 'westernize',
    'Pizzagate': 'Pizzagate conspiracy theory',
    'naïve': 'naive',
    'Skripal': 'Sergei Skripal',
    'Remainers': 'British remainer',
    'remainers': 'British remainer',
    'bremainer': 'British remainer',
    'antibrahmin': 'anti Brahminism',
    'HYPSM': ' Harvard, Yale, Princeton, Stanford, MIT',
    'HYPS': ' Harvard, Yale, Princeton, Stanford',
    'kompromat': 'compromising material',
    'Tharki': 'pervert',
    'tharki': 'pervert',
    'mastuburate': 'masturbate',
    'Zoë': 'Zoe',
    'indans': 'Indian',
    ' xender': ' gender',
    'Naxali ': 'Naxalite ',
    'Naxalities': 'Naxalites',
    'Bathla': 'Namit Bathla',
    'Mewani': 'Indian politician Jignesh Mevani',
    'clichéd': 'cliche',
    'cliché': 'cliche',
    'clichés': 'cliche',
    'Wjy': 'Why',
    'Fadnavis': 'Indian politician Devendra Fadnavis',
    'Awadesh': 'Indian engineer Awdhesh Singh',
    'Awdhesh': 'Indian engineer Awdhesh Singh',
    'Khalistanis': 'Sikh separatist movement',
    'madheshi': 'Madheshi',
    'BNBR': 'Be Nice, Be Respectful',
    'Bolsonaro': 'Jair Bolsonaro',
    'XXXTentacion': 'Tentacion',
    'Padmavat': 'Indian Movie Padmaavat',
    'Žižek': 'Slovenian philosopher Slavoj Žižek',
    'Adityanath': 'Indian monk Yogi Adityanath',
    'Brexit': 'British Exit',
    'Brexiter': 'British Exit supporter',
    'Brexiters': 'British Exit supporters',
    'Brexiteer': 'British Exit supporter',
    'Brexiteers': 'British Exit supporters',
    'Brexiting': 'British Exit',
    'Brexitosis': 'British Exit disorder',
    'brexit': 'British Exit',
    'brexiters': 'British Exit supporters',
    'jallikattu': 'Jallikattu',
    'fortnite': 'Fortnite ',
    'Swachh': 'Swachh Bharat mission campaign ',
    'Quorans': 'Quoran',
    'Qoura ': 'Quora ',
    'quoras': 'Quora',
    'Quroa': 'Quora',
    'QUORA': 'Quora',
    'narcissit': 'narcissist',
    # extra in sample
    'Doklam': 'Tibet',
    'Drumpf ': 'Donald Trump fool ',
    'Drumpfs': 'Donald Trump fools',
    'Strzok': 'Hillary Clinton scandal',
    'rohingya': 'Rohingya ',
    'wumao ': 'cheap Chinese stuff',
    'wumaos': 'cheap Chinese stuff',
    'Sanghis': 'Sanghi',
    'Tamilans': 'Tamils',
    'biharis': 'Biharis',
    'Rejuvalex': 'hair growth formula',
    'Feku': 'The Man of India ',
    'deplorables': 'deplorable',
    'muhajirs': 'Muslim immigrant',
    'Gujratis': 'Gujarati',
    'Chutiya': 'Tibet people ',
    'Chutiyas': 'Tibet people ',
    'thighing': 'masturbate',
    '卐': 'Nazi Germany',
    'Pribumi': 'Native Indonesian',
    'Gurmehar': 'Gurmehar Kaur Indian student activist',
    'Novichok': 'Soviet Union agents',
    'Khazari': 'Khazars',
    'Demonetization': 'demonetization',
    'demonetisation': 'demonetization',
    'demonitisation': 'demonetization',
    'demonitization': 'demonetization',
    'cryptocurrencies': 'cryptocurrency',
    'Hindians': 'North Indian who hate British',
    'vaxxer': 'vocal nationalist ',
    'remoaner': 'remainer ',
    'bremoaner': 'British remainer ',
    'Jewism': 'Judaism',
    'Eroupian': 'European',
    'WMAF': 'White male married Asian female',
    'moeslim': 'Muslim',
    'cishet': 'cisgender and heterosexual person',
    'Eurocentric': 'Eurocentrism ',
    'Jewdar': 'Jew dar',
    'Asifa': 'abduction, rape, murder case ',
    'marathis': 'Marathi',
    'Trumpanzees': 'Trump chimpanzee fool',
    'Crimean': 'Crimea people ',
    'atrracted': 'attract',
    'LGBT': 'lesbian, gay, bisexual, transgender',
    'Boshniak': 'Bosniaks ',
    'Myeshia': 'widow of Green Beret killed in Niger',
    'demcoratic': 'Democratic',
    'raaping': 'rape',
    'Dönmeh': 'Islam',
    'feminazism': 'feminism nazi',
    'langague': 'language',
    'Hongkongese': 'HongKong people',
    'hongkongese': 'HongKong people',
    'Kashmirians': 'Kashmirian',
    'Chodu': 'fucker',
    'penish': 'penis',
    'micropenis': 'tiny penis',
    'Madridiots': 'Real Madrid idiot supporters',
    'Ambedkarite': 'Dalit Buddhist movement ',
    'ReleaseTheMemo': 'cry for the right and Trump supporters',
    'harrase': 'harass',
    'Barracoon': 'Black slave',
    'Castrater': 'castration',
    'castrater': 'castration',
    'Rapistan': 'Pakistan rapist',
    'rapistan': 'Pakistan rapist',
    'Turkified': 'Turkification',
    'turkified': 'Turkification',
    'Dumbassistan': 'dumb ass Pakistan',
    'facetards': 'Facebook retards',
    'rapefugees': 'rapist refugee',
    'superficious': 'superficial',
    # extra from kagglers
    'colour': 'color',
    'centre': 'center',
    'favourite': 'favorite',
    'travelling': 'traveling',
    'counselling': 'counseling',
    'theatre': 'theater',
    'cancelled': 'canceled',
    'labour': 'labor',
    'organisation': 'organization',
    'wwii': 'world war 2',
    'citicise': 'criticize',
    'youtu ': 'youtube ',
    'sallary': 'salary',
    'Whta': 'What',
    'narcisist': 'narcissist',
    'howdo': 'how do',
    'whatare': 'what are',
    'howcan': 'how can',
    'howmuch': 'how much',
    'howmany': 'how many',
    'whydo': 'why do',
    'doI': 'do I',
    'theBest': 'the best',
    'howdoes': 'how does',
    'mastrubation': 'masturbation',
    'mastrubate': 'masturbate',
    'mastrubating': 'masturbating',
    'pennis': 'penis',
    'Etherium': 'Ethereum',
    'bigdata': 'big data',
    '2k17': '2017',
    '2k18': '2018',
    'qouta': 'quota',
    'exboyfriend': 'ex boyfriend',
    'airhostess': 'air hostess',
    'whst': 'what',
    'watsapp': 'whatsapp',
    # extra
    'bodyshame': 'body shaming',
    'bodyshoppers': 'body shopping',
    'bodycams': 'body cams',
    'Cananybody': 'Can any body',
    'deadbody': 'dead body',
    'deaddict': 'de addict',
    'Northindian': 'North Indian ',
    'northindian': 'north Indian ',
    'northkorea': 'North Korea',
    'Whykorean': 'Why Korean',
    'koreaboo': 'Korea boo ',
    'Brexshit': 'British Exit bullshit',
    'shithole': ' shithole ',
    'shitpost': 'shit post',
    'shitslam': 'shit Islam',
    'shitlords': 'shit lords',
    'Fck': 'Fuck',
    'fck': 'fuck',
    'Clickbait': 'click bait ',
    'clickbait': 'click bait ',
    'mailbait': 'mail bait',
    'healhtcare': 'healthcare',
    'trollbots': 'troll bots',
    'trollled': 'trolled',
    'trollimg': 'trolling',
    'cybertrolling': 'cyber trolling',
    'sickular': 'India sick secular ',
    'suckimg': 'sucking',
    'Idiotism': 'idiotism',
    'Niggerism': 'Nigger',
    'Niggeriah': 'Nigger',
}

# Compiled once at import time instead of on every call.
# Regex alternation takes the FIRST alternative that matches at a position, so
# the keys are ordered longest-first: otherwise 'SJW' would win over 'SJWs'
# and 'Brexit' over 'Brexiteers', leaving a stray suffix behind.
# re.escape keeps the keys literal should one ever contain a metacharacter.
_MISSPELL_RE = re.compile(
    '(%s)' % '|'.join(
        re.escape(key)
        for key in sorted(_MISSPELL_TO_SUB, key=len, reverse=True)
    )
)


def clean_misspell(text):
    """
    replace known misspelled / out-of-vocabulary tokens (quora vs. glove)

    Every occurrence of a key of the misspell table is substituted by its
    replacement text. When several keys match at the same position the
    longest key wins.

    reference:
    https://www.kaggle.com/hengzheng/attention-capsule-why-not-both-lb-0-694

    Parameters
    ----------
    text: str, raw text

    Returns
    -------
    str, text with the known tokens replaced
    """
    # every match is, by construction, a key of the table
    return _MISSPELL_RE.sub(
        lambda match: _MISSPELL_TO_SUB[match.group(0)], text)
# Sub-words that tend to be glued to their neighbours. Raw strings are used
# so that `\W` reaches the regex engine instead of being an invalid string
# escape. Compiled once at import time.
_SPACING_MISSPELL_RE = re.compile(
    '(%s)' % '|'.join([
        r'[Ff]uck',
        r'Trump',
        r'\W[Aa]nti',
        r'[Ww]hy',
        r'[Ww]hat',
        r'How',
        r'care\W',
        r'\Wover',
        r'gender',
        r'people',
    ])
)


def spacing_misspell(text):
    """
    put a space before and after a few known sub-words

    e.g. 'Trumpland' -> ' Trump land', 'healthcare.' -> 'health care. '

    Patterns containing `\\W` include the adjacent non-word character in the
    match, so that character ends up inside the inserted spaces.

    Parameters
    ----------
    text: str

    Returns
    -------
    str, text with every match surrounded by single spaces
    """
    # group 1 is the whole alternation, i.e. the complete match
    return _SPACING_MISSPELL_RE.sub(r" \1 ", text)
# LaTeX command name (without the leading backslash) -> English replacement.
# NOTE(review): `\int` is LaTeX's integral sign; the ' integer ' replacement
# is kept unchanged here - confirm whether ' integral ' was intended.
_LATEX_COMMAND_TO_SUB = {
    'mathrm': ' LaTex math mode ',
    'mathbb': ' LaTex math mode ',
    'boxed': ' LaTex equation ',
    'begin': ' LaTex equation ',
    'end': ' LaTex equation ',
    'left': ' LaTex equation ',
    'right': ' LaTex equation ',
    'overbrace': ' LaTex equation ',
    'underbrace': ' LaTex equation ',
    'text': ' LaTex equation ',
    'vec': ' vector ',
    'var': ' variable ',
    'theta': ' theta ',
    'mu': ' average ',
    'min': ' minimum ',
    'max': ' maximum ',
    'sum': ' + ',
    'times': ' * ',
    'cdot': ' * ',
    'hat': ' ^ ',
    'frac': ' / ',
    'div': ' / ',
    'sin': ' Sine ',
    'cos': ' Cosine ',
    'tan': ' Tangent ',
    'infty': ' infinity ',
    'int': ' integer ',
    'in': ' in ',
}

# One backslash followed by a known command name. Names are ordered
# longest-first so that 'infty' / 'int' are tried before their prefix 'in'.
_LATEX_COMMAND_RE = re.compile(
    r'\\(%s)' % '|'.join(
        sorted(map(re.escape, _LATEX_COMMAND_TO_SUB), key=len, reverse=True)
    )
)


def clean_latex(text):
    r"""
    convert LaTeX markup such as "[math]\vec{x} + \vec{y}[/math]" to English

    Steps, in order:
    1. "[math]" and "[/math]" tags become ' LaTex math '
    2. known commands (\vec, \frac, \infty, ...) become their replacement
    3. any backslash still left (unknown command) becomes ' LaTex '

    The known commands must be handled BEFORE the generic backslash
    replacement: once the backslashes are gone no command can be recognised.

    Parameters
    ----------
    text: str

    Returns
    -------
    str, text without backslashes
    """
    # edge case: the quora math tags
    text = re.sub(r'\[math\]', ' LaTex math ', text)
    text = re.sub(r'\[\/math\]', ' LaTex math ', text)
    # known commands; group 1 is the command name, always a key of the table
    text = _LATEX_COMMAND_RE.sub(
        lambda match: _LATEX_COMMAND_TO_SUB[match.group(1)], text)
    # fallback for everything else that still carries a backslash
    return text.replace('\\', ' LaTex ')
def normalize_unicode(text):
    """
    apply unicode normalization form NFKD to `text`

    Parameters
    ----------
    text: str

    Returns
    -------
    str, the NFKD-normalized text
    """
    normalized = unicodedata.normalize('NFKD', text)
    return normalized
def remove_newline(text):
    """
    replace newline, tab, backspace and carriage-return characters by a space

    Parameters
    ----------
    text: str

    Returns
    -------
    str, same length as the input, control characters turned into ' '
    """
    # NOTE: '\b' is a non-raw literal, i.e. the backspace character (0x08),
    # not the regex word-boundary assertion
    for control_char in ('\n', '\t', '\b', '\r'):
        text = re.sub(control_char, ' ', text)
    return text
def decontracted(text):
    """
    expand english contractions, e.g. "can't" -> "can not"

    The rules are applied one after another, in the listed order; both the
    ascii apostrophe and the typographic one (’) are recognised. The order
    matters: the specific rules must run before the general suffix rules.

    Parameters
    ----------
    text: str

    Returns
    -------
    str, text with the contractions expanded
    """
    rules = (
        # specific
        (r"(W|w)on(\'|\’)t", "will not"),
        (r"(C|c)an(\'|\’)t", "can not"),
        (r"(Y|y)(\'|\’)all", "you all"),
        (r"(Y|y)a(\'|\’)ll", "you all"),
        # general
        (r"(I|i)(\'|\’)m", "i am"),
        (r"(A|a)in(\'|\’)t", "is not"),
        (r"n(\'|\’)t", " not"),
        (r"(\'|\’)re", " are"),
        (r"(\'|\’)s", " is"),
        (r"(\'|\’)d", " would"),
        (r"(\'|\’)ll", " will"),
        (r"(\'|\’)t", " not"),
        (r"(\'|\’)ve", " have"),
    )
    for contraction, expansion in rules:
        text = re.sub(contraction, expansion, text)
    return text
# punctuation / symbols seen in the data, on top of string.punctuation
_EXTRA_PUNCT = [
    ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&',
    '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£',
    '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›',
    '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
    '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
    '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', ':', '¼', '⊕', '▼',
    '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
    'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', ')', '↓', '、', '│', '(', '»',
    ',', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
    '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']

# Character class matching any single punctuation character, compiled once.
# Each character is escaped individually: joined raw, the sorted sequence
# contains `\` directly followed by `]`, which the regex engine reads as an
# escaped bracket - the backslash itself would then never be matched.
_PUNCT_RE = re.compile(
    '([%s])' % ''.join(
        re.escape(char)
        for char in sorted(set(string.punctuation) | set(_EXTRA_PUNCT))
    )
)


def spacing_punctuation(text):
    """
    add a space before and after every punctuation character / symbol

    The set of characters is string.punctuation plus an extra list of
    symbols; each single matching character is padded independently, so
    'a,b' -> 'a , b' and '!!' -> ' !  ! '.

    Parameters
    ----------
    text: str

    Returns
    -------
    str, text with every punctuation character surrounded by spaces
    """
    return _PUNCT_RE.sub(r' \1 ', text)
def spacing_digit(text):
    """
    surround every single ascii digit by spaces, e.g. 'a12b' -> 'a 1  2 b'

    Parameters
    ----------
    text: str

    Returns
    -------
    str, text with each digit padded by one space on both sides
    """
    padded = re.sub('([0-9])', r' \1 ', text)
    return padded
def spacing_number(text):
    """
    surround every run of ascii digits by spaces, e.g. 'a12b' -> 'a 12 b'

    Parameters
    ----------
    text: str

    Returns
    -------
    str, text with each whole number padded by one space on both sides
    """
    padded = re.sub('([0-9]{1,})', r' \1 ', text)
    return padded
def remove_number(text):
    """
    replace every run of digits by a single space (numbers are not toxic)

    e.g. 'a12b3' -> 'a b '

    Parameters
    ----------
    text: str

    Returns
    -------
    str, text without digits
    """
    # raw string: '\d' in a normal literal is an invalid escape sequence
    return re.sub(r'\d+', ' ', text)
def remove_space(text):
    """
    collapse whitespace runs into one space and drop trailing whitespace

    A leading whitespace run is NOT removed; it is only reduced to a single
    space, e.g. ' a \\n b  ' -> ' a b'.

    Parameters
    ----------
    text: str

    Returns
    -------
    str, text with single spaces between tokens and no trailing whitespace
    """
    # raw strings: '\s' in a normal literal is an invalid escape sequence
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\s+$', '', text)
    return text
""" tokenizer """
def preprocess(text, remove_num=True):
    """
    turn raw text into clean text that is ready for tokenization

    Pipeline (in this order):
    1. decontracted        - expand contractions
    2. clean_misspell      - replace known misspelled tokens
    3. spacing_misspell    - put spaces around known sub-words
    4. clean_latex         - translate LaTeX markup
    5. spacing_punctuation - put spaces around punctuation
    6. remove_number if `remove_num` else spacing_digit
    7. remove_space        - collapse whitespace

    normalize_unicode and remove_newline exist in this module but are not
    applied here.

    NOTE (from the original author):
    1. glove supports upper case words
    2. glove supports digit
    3. glove supports punctuation
    4. glove supports domains e.g. www.apple.com
    5. glove supports misspelled words e.g. FUCKKK

    Parameters
    ----------
    text: str, raw text

    remove_num: bool, drop the numbers when True (default), otherwise keep
        them and put spaces around every digit

    Returns
    -------
    str, cleaned text
    """
    text_steps = (
        decontracted,
        clean_misspell,
        spacing_misspell,
        clean_latex,
        spacing_punctuation,
    )
    for step in text_steps:
        text = step(text)
    # handle number
    number_step = remove_number if remove_num else spacing_digit
    text = number_step(text)
    # remove space
    return remove_space(text)
|