import re
import textwrap
import warnings
from collections import Counter

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from matplotlib.lines import Line2D
from nltk.tokenize import word_tokenize
from sklearn.manifold import TSNE

warnings.filterwarnings('ignore')
DataSet Structure¶
| Sentence | Cleaned Final Tags |
|---|---|
| This is hate sentence 1: | "Hate tag 1" |
| This is hate sentence 2: | "Hate tag 2, Hate tag 3" |
| This is hate sentence 3: | "Hate tag 1, Hate tag 2, Hate tag 3" |
| This is hate sentence 4: | "Hate tag 4, Hate tag 5" |
# Load the tagged Roman Hindi/Urdu dataset; the columns used below are
# 'Sentence' and 'Cleaned Final Tags'.
df=pd.read_csv(r"C:\Users\PMLS\Downloads\Notebook Data\Roman_hindi_withTag.csv")
Our dataset has 6 tags that are:
- Derogatory Remarks
- Ideological Intolerance
- Sex-Based Hate
- Faith-Based Intolerance
- Ethnic Hate
- Neutral
- and the Combinations of all the hate tags , as this is Multi hate
`
df['Cleaned Final Tags'].nunique()
31
Data Cleaning (Tags and Sentences)¶
# Capture the raw (pre-cleaning) unique tag strings so we can later build
# an original-value -> cleaned-value mapping.
un1=df['Cleaned Final Tags'].unique()
unique_tags = un1.tolist()
unique_tags_series = pd.Series(unique_tags)
un1  # these will be used later for the tag-cleaning mapping
array(['Neutral',
'Derogatory Remarks, Ideological Intolerance, Sex-Based Hate',
'Ideological Intolerance, Sex-Based Hate,Derogatory Remarks',
'Derogatory Remarks, Sex-Based Hate',
'Derogatory Remarks, Faith-Based Intolerance',
'Domgraphy Remarks, Sex-Based Hate', 'Derogatory Remarks',
'Derogatory Remarks, Faith-Based Intolerance, Sex-Based Hate',
'Derogatory Remarks, Ideological Intolerance, Faith-Based Intolerance',
'Faith-Based Intolerance, Sex-Based Hate',
'Faith-Based Intolerance',
'Derogatory Remarks, Ideological Intolerance',
'Ethnic Hate, Faith-Based Intolerance',
'Ideological Intolerance, Faith-Based Intolerance', 'Sex-Bas',
'Ideological Intolerance, Ethnic Hate', 'Ideological Intolerance',
'Ideological Intolerance, Sex-Based Hate', 'Sex-Based Hate',
'Derogatory Remarks, Ethnic Hate, Sex-Based Hate',
'Derogatory Remarks, Ideological Intolerance, Faith-Based Intolerance, Sex-Based Hate',
'Sex-Based Hate, Derogatory Remarks', ', Sex-Based Hate',
'Derogatory Remarks, Ideological Intolerance, Ethnic Hate',
'Derogatory Remarks, Ethnic Hate', 'Ethnic Hate',
'Ideological Intolerance, Ethnic Hate, Faith-Based Intolerance',
'Derogatory Remarks, Ethnic Hate, Faith-Based Intolerance',
'Ethnic Hate, Sex-Based Hate',
'Ethnic Hate, Faith-Based Intolerance, Sex-Based Hate',
'Ideological Intolerance, Faith-Based Intolerance, Sex-Based Hate'],
dtype=object)
- We are going to analyze these unique tags in depth — manually, or with the help of ChatGPT, which I usually use for such tasks.
Analysis of Inconsistencies in Tagging Process¶
1. Order and Spacing Issues¶
Order Issues: During the tagging process, some data entries have the same tags but are ordered inconsistently, such as:
`Derogatory Remarks, Sex-Based Hate` vs `Sex-Based Hate, Derogatory Remarks`
These should be standardized to maintain consistency.
Spacing Issues: Tags with inconsistent spacing are treated differently, even though they represent the same categories. Examples include:
`Derogatory Remarks,Sex-Based Hate` vs `Derogatory Remarks, Sex-Based Hate`
Proper spacing should be ensured to avoid such discrepancies.
2. General Mistakes in Tags¶
Incomplete or Misspelled Tags: There are instances where tags are incomplete or contain spelling mistakes. Some examples are:
`Sex-Bas` instead of `Sex-Based Hate`, and `, Sex-Based Hate` (an extra comma at the beginning)
These errors should be corrected to ensure the accuracy of the tagging process.
Fixing Order and Spacing Issues¶
- All of our order and spacing issues will be fixed by the simple few-line function below — essentially a single regular expression plus a sort.
def clean_and_sort_tags(tag):
    """Normalize a comma-separated multi-tag string.

    Trims whitespace, unifies comma spacing, removes duplicate tags and
    sorts them alphabetically so order/spacing variants map to one
    canonical form.

    NaN values pass through unchanged so pandas missing-data handling
    is preserved.

    Empty fragments are dropped, which also fixes the ', Sex-Based Hate'
    variant (extra leading comma) at the source instead of needing a
    separate regex pass afterwards.
    """
    if pd.isna(tag):
        return tag  # NaN is not removed, just returned untouched
    # Split on commas, strip each piece, discard empties, dedupe via set.
    parts = {part.strip() for part in tag.split(',') if part.strip()}
    return ', '.join(sorted(parts))
# Rebuild the tag list from the raw uniques and clean each one.
unique_tags = un1.tolist()
unique_tags_series = pd.Series(unique_tags)
cleaned_tags = unique_tags_series.apply(clean_and_sort_tags)
# Map each original (dirty) tag string to its cleaned, sorted form.
tag_mapping = dict(zip(unique_tags, cleaned_tags))
# Apply the mapping to the 'Cleaned Final Tags' column; values not seen in
# the mapping are cleaned on the fly as a fallback.
df['Cleaned Final Tags'] = df['Cleaned Final Tags'].apply(lambda x: tag_mapping.get(x, clean_and_sort_tags(x)))
df['Cleaned Final Tags'].nunique()
29
- Our count decreased from 31 to 29.
- Next, we will fix the identified spelling/truncation mistakes in the tags themselves.
# Fix the truncated tag 'Sex-Bas' -> 'Sex-Based Hate'.
df['Cleaned Final Tags'] = df['Cleaned Final Tags'].str.replace(r'\bSex-Bas\b', 'Sex-Based Hate', regex=True)
# Drop the stray leading comma in ', Sex-Based Hate'.
df['Cleaned Final Tags'] = df['Cleaned Final Tags'].str.replace(r'^,\s*Sex-Based Hate\b', 'Sex-Based Hate', regex=True)
df['Cleaned Final Tags'].nunique()
27
Sentence CleanUp¶
def clean_tweet(tweet):
    """Strip tweet noise: retweet markers, newlines/tabs, URLs, hashtags
    and @-mentions, then collapse any leftover runs of whitespace."""
    # Drop the standalone retweet marker first.
    tweet = re.sub(r'\bRT\b', '', tweet)
    # Normalize newlines and tabs to single spaces.
    tweet = tweet.replace('\n', ' ').replace('\t', ' ')
    # Remove URLs, hashtags and mentions, in that order.
    for noise_pattern in (r'http\S+', r'#\S+', r'@\S+'):
        tweet = re.sub(noise_pattern, '', tweet)
    # Collapse whitespace left behind by the removals and trim the ends.
    return re.sub(r'\s+', ' ', tweet).strip()
# Apply the tweet cleaner into a new column, keeping the raw sentence intact.
df['Clean Sentence'] = df['Sentence'].apply(clean_tweet)
Visualizations¶
Tag Distribution Analysis¶
import matplotlib.pyplot as plt
# The six base tags we report individual counts for.
tags = ['Derogatory Remarks', 'Sex-Based Hate', 'Faith-Based Intolerance',
        'Ideological Intolerance', 'Ethnic Hate', 'Neutral']
tag_counts = {tag: 0 for tag in tags}
# A multi-label row increments every base tag it contains.
for tags_list in df['Cleaned Final Tags']:
    for tag in tags_list.split(','):
        tag = tag.strip()  # remove leading/trailing whitespace
        if tag in tag_counts:
            tag_counts[tag] += 1
# Order bars by descending count.
tag_counts = dict(sorted(tag_counts.items(), key=lambda item: item[1], reverse=True))
# NOTE(review): this is the sum of tag occurrences, not distinct sentences —
# a multi-label sentence is counted once per tag, so the 'Total Sentences'
# annotation below overstates the true sentence count. Confirm intent.
total_sentences = sum(tag_counts.values())
plt.figure(figsize=(10, 5))
bars = plt.bar(tag_counts.keys(), tag_counts.values(), color='#2F4F4F')
# Annotate each bar with its count.
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height}', ha='center', va='bottom')
max_count = max(tag_counts.values())
plt.text(len(tags) - 2, max_count * 1.1, f'Total Sentences: {total_sentences}',
         ha='center', va='bottom', fontsize=12, color='black', fontweight='bold')
plt.xlabel('Tags')
plt.ylabel('Count')
plt.title('Count of Unique Tags in Cleaned Final Tags')
plt.xticks(rotation=45)
plt.ylim(0, max_count * 1.2)  # headroom for the total annotation
plt.tight_layout()
plt.show()
Let's analyze the distribution of plain hate-based sentences now¶
df['Cleaned Final Tags'].unique()
array(['Neutral',
'Derogatory Remarks, Ideological Intolerance, Sex-Based Hate',
'Derogatory Remarks, Sex-Based Hate',
'Derogatory Remarks, Faith-Based Intolerance',
'Domgraphy Remarks, Sex-Based Hate', 'Derogatory Remarks',
'Derogatory Remarks, Faith-Based Intolerance, Sex-Based Hate',
'Derogatory Remarks, Faith-Based Intolerance, Ideological Intolerance',
'Faith-Based Intolerance, Sex-Based Hate',
'Faith-Based Intolerance',
'Derogatory Remarks, Ideological Intolerance',
'Ethnic Hate, Faith-Based Intolerance',
'Faith-Based Intolerance, Ideological Intolerance',
'Sex-Based Hate', 'Ethnic Hate, Ideological Intolerance',
'Ideological Intolerance',
'Ideological Intolerance, Sex-Based Hate',
'Derogatory Remarks, Ethnic Hate, Sex-Based Hate',
'Derogatory Remarks, Faith-Based Intolerance, Ideological Intolerance, Sex-Based Hate',
'Derogatory Remarks, Ethnic Hate, Ideological Intolerance',
'Derogatory Remarks, Ethnic Hate', 'Ethnic Hate',
'Ethnic Hate, Faith-Based Intolerance, Ideological Intolerance',
'Derogatory Remarks, Ethnic Hate, Faith-Based Intolerance',
'Ethnic Hate, Sex-Based Hate',
'Ethnic Hate, Faith-Based Intolerance, Sex-Based Hate',
'Faith-Based Intolerance, Ideological Intolerance, Sex-Based Hate'],
dtype=object)
# The full set of unique tag combinations, copied from the unique() output
# above. NOTE(review): 'Domgraphy Remarks, Sex-Based Hate' looks like a
# misspelling of 'Derogatory Remarks' that survived cleaning — verify.
tags = ['Neutral',
        'Derogatory Remarks, Ideological Intolerance, Sex-Based Hate',
        'Derogatory Remarks, Sex-Based Hate',
        'Derogatory Remarks, Faith-Based Intolerance',
        'Domgraphy Remarks, Sex-Based Hate', 'Derogatory Remarks',
        'Derogatory Remarks, Faith-Based Intolerance, Sex-Based Hate',
        'Derogatory Remarks, Faith-Based Intolerance, Ideological Intolerance',
        'Faith-Based Intolerance, Sex-Based Hate',
        'Faith-Based Intolerance',
        'Derogatory Remarks, Ideological Intolerance',
        'Ethnic Hate, Faith-Based Intolerance',
        'Faith-Based Intolerance, Ideological Intolerance',
        'Sex-Based Hate', 'Ethnic Hate, Ideological Intolerance',
        'Ideological Intolerance',
        'Ideological Intolerance, Sex-Based Hate',
        'Derogatory Remarks, Ethnic Hate, Sex-Based Hate',
        'Derogatory Remarks, Faith-Based Intolerance, Ideological Intolerance, Sex-Based Hate',
        'Derogatory Remarks, Ethnic Hate, Ideological Intolerance',
        'Derogatory Remarks, Ethnic Hate', 'Ethnic Hate',
        'Ethnic Hate, Faith-Based Intolerance, Ideological Intolerance',
        'Derogatory Remarks, Ethnic Hate, Faith-Based Intolerance',
        'Ethnic Hate, Sex-Based Hate',
        'Ethnic Hate, Faith-Based Intolerance, Sex-Based Hate',
        'Faith-Based Intolerance, Ideological Intolerance, Sex-Based Hate']
tag_counts_individual_sets = {tag: 0 for tag in tags}
# Count exact occurrences of each full combination (no splitting here —
# 'A, B' is counted as one set, not as A and B separately).
for tags_list in df['Cleaned Final Tags']:
    tags_list = tags_list.strip()  # remove leading/trailing spaces
    if tags_list in tag_counts_individual_sets:
        tag_counts_individual_sets[tags_list] += 1
tag_counts_df = pd.DataFrame(list(tag_counts_individual_sets.items()), columns=['Label', 'Count'])
tag_counts_df=tag_counts_df.sort_values(by='Count', ascending=False).reset_index(drop=True)
# Secondary sort key: number of tags in each combination.
tag_counts_df['Label_Count'] = tag_counts_df['Label'].apply(lambda x: len(x.split(',')))
# Sort by 'Count' (primary, descending) and 'Label_Count' (secondary,
# ascending). sorted_df is consumed later by the Neutral-vs-hate cell.
sorted_df = tag_counts_df.sort_values(by=['Count', 'Label_Count'], ascending=[False, True]).reset_index(drop=True)
# Neutral dominates, so drop it from the hate-only visual.
tag_counts_df = tag_counts_df[tag_counts_df['Label'] != 'Neutral']
plt.style.use('ggplot')
plt.figure(figsize=(15, 8))
plt.bar(tag_counts_df['Label'], tag_counts_df['Count'], color='#DB7093', edgecolor='black')
plt.xlabel('Tags')
plt.ylabel('Count')
plt.title('Count of Unique Sets and Individual Tags in Cleaned Final Tags')
plt.xticks(rotation=90)
plt.show()
- Let's try to make this visual better and more appealing.
- We will abbreviate the naming conventions:
- Derogatory Remarks: DR
- Sex-Based Hate: SBH
- Faith-Based Intolerance: FBI
- Ideological Intolerance: II
- Ethnic Hate: EH
- All combinations of them will follow the same pattern by combining these abbreviations.
# Abbreviations for the base labels; tag combinations reuse these short forms.
base_labels = {
    'Derogatory Remarks': 'DR',
    'Sex-Based Hate': 'SBH',
    'Faith-Based Intolerance': 'FBI',
    'Ideological Intolerance': 'II',
    'Ethnic Hate': 'EH',
    'Neutral': 'N'
}
def generate_custom_label(tag, base_labels):
    """Translate a comma-separated tag string into its abbreviated form.

    Each individual tag is looked up in *base_labels*; unknown tags fall
    back to their own (stripped) text unchanged.
    """
    short_forms = []
    for piece in tag.split(','):
        name = piece.strip()
        short_forms.append(base_labels.get(name, name))
    return ', '.join(short_forms)
# Precompute the abbreviated label for every known tag combination.
custom_labels = {tag: generate_custom_label(tag, base_labels) for tag in tags}
# Any label not covered above keeps its original text as-is.
for label in tag_counts_df['Label'].unique():
    if label not in custom_labels:
        custom_labels[label] = label
tag_counts_df = tag_counts_df[tag_counts_df['Label'] != 'Neutral']
tag_counts_df = tag_counts_df.sort_values(by='Count', ascending=False)
plt.style.use('ggplot')
plt.figure(figsize=(15, 8))
bars = plt.barh(tag_counts_df['Label'], tag_counts_df['Count'], color='#FF7F50', edgecolor='black')
# Annotate each horizontal bar with its count, just past the bar end.
for bar in bars:
    width = bar.get_width()
    plt.text(width + 5, bar.get_y() + bar.get_height() / 2, f'{int(width)}', ha='center', va='center', fontsize=10, color='black')
plt.xlabel('Count')
plt.ylabel('Tags')
plt.title('Count of Unique Sets and Individual Tags in Cleaned Final Tags')
# Replace the y tick text with the abbreviated labels.
plt.yticks(ticks=range(len(tag_counts_df['Label'])), labels=[custom_labels[label] for label in tag_counts_df['Label']])
# Legend entries are a key for the abbreviations; the colors are decorative.
legend_elements = [
    Line2D([0], [0], color='red', lw=4, label='Derogatory Remarks: DR'),
    Line2D([0], [0], color='blue', lw=4, label='Sex-Based Hate: SBH'),
    Line2D([0], [0], color='green', lw=4, label='Faith-Based Intolerance: FBI'),
    Line2D([0], [0], color='purple', lw=4, label='Ideological Intolerance: II'),
    Line2D([0], [0], color='orange', lw=4, label='Ethnic Hate: EH'),
    Line2D([0], [0], color='gray', lw=4, label='All Combinations')
]
plt.legend(handles=legend_elements, loc='upper right', title='Abbrevated names')
plt.tight_layout()
plt.show()
- Using this we can spot the outlier `Domgraphy Remarks` (a misspelling of `Derogatory Remarks`) hiding in the data.
- We will add one similar visual to see this kind of distribution.
# Funnel-style visual: each hate label's share of all hate sentences,
# drawn as centered bars under a full-width 'total' bar.
total_hate_sentences = tag_counts_df['Count'].sum()
prop_counts = tag_counts_df['Count'] / total_hate_sentences * 100
# Prepend a 100% entry for the 'Total Hate Sentences' bar.
# NOTE(review): after this concat, prop_counts has one more element than
# labels, so zip(labels, prop_counts) below pairs the first hate label
# with the 100% entry — the bar widths appear shifted by one relative to
# their labels. Confirm whether this offset is intended.
prop_counts = pd.concat([pd.Series([100]), prop_counts], ignore_index=True)
labels = tag_counts_df['Label']
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(15, 8))
# Grey backdrop bar representing 100% of hate sentences, centered on x=0.
ax.barh('Total Hate Sentences', 100, color='lightgrey', edgecolor='black', left=-50)
# Each label bar is centered by offsetting its left edge by half its width.
for i, (label, count) in enumerate(zip(labels, prop_counts)):
    ax.barh(label, count, color='#008080', edgecolor='black', left=-count/2)
# Percentage annotations, one per bar row.
for i, count in enumerate(prop_counts):
    ax.text(count / 2, i, f'{count:.2f}%', ha='left', va='center', fontsize=8, color='black')
ax.text(0, 'Total Hate Sentences', f'Total: {total_hate_sentences}', ha='center', va='center', fontsize=10, color='black', fontweight='bold')
ax.set_xlabel('Percentage of Total Hate Sentences')
ax.set_title('Proportional Contribution to Total Hate Sentences')
ax.grid(False)
ax.set_xlim(-50, 60)
plt.tight_layout()
plt.show()
Normal vs All Hate labels¶
# Compare Neutral against the combined count of every hate label.
normal_count = sorted_df[sorted_df['Label'] == 'Neutral']['Count'].values[0]
# BUG FIX: this filter previously excluded 'Normal' — a label that does not
# exist in sorted_df (it was renamed to 'Neutral' in this dataset) — so the
# hate total silently included the Neutral rows as well.
hate_labels_count = sorted_df[sorted_df['Label'] != 'Neutral']['Count'].sum()
new_data = {
    'Label': ['Neutral', 'Hate_Labels_Comb'],
    'Count': [normal_count, hate_labels_count]
}
new_df = pd.DataFrame(new_data)
new_df
def autopct_format(pct, allvals):
    """Return a two-line pie-wedge label: 'XX.X%' over '(count)'.

    Recovers the absolute count from the wedge percentage and the full
    list of values that matplotlib passed in.
    """
    absolute = int(round(pct/100.*sum(allvals)))
    return f"{pct:.1f}%\n({absolute:d})"
# Pie chart of Neutral vs all hate labels combined, each wedge annotated
# with its percentage and absolute count.
plt.figure(figsize=(3, 3))
plt.pie(new_df['Count'], labels=new_df['Label'], autopct=lambda pct: autopct_format(pct, new_df['Count']),
        colors=['#1E90FF', '#FFD700'], startangle=140, wedgeprops={'edgecolor': 'black'})
plt.title("Neutral vs All Hate Labels Combined")
plt.show()
Next, We Will Analyze the Sentences and Their Contextual Relevance¶
Word Frequency Analysis ( WordClouds)¶
!pip install Pillow
Requirement already satisfied: Pillow in c:\users\pmls\anaconda3\lib\site-packages (10.2.0)
[notice] A new release of pip is available: 24.1.2 -> 24.2 [notice] To update, run: python.exe -m pip install --upgrade pip
from wordcloud import WordCloud, STOPWORDS ,ImageColorGenerator
from PIL import Image
# BUG FIX: join with spaces — ''.join glued the last word of each sentence
# to the first word of the next, corrupting the word frequencies the cloud
# is built from.
text = ' '.join(df['Clean Sentence'])
wc = WordCloud()
wc.generate(text)
plt.imshow(wc, interpolation='bilinear')
# BUG FIX: plt.show was referenced but never called (the cell output showed
# the function's repr instead of rendering the figure).
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Load the pre-cleaned corpus text for the masked word cloud.
with open(r"C:\Users\PMLS\Downloads\RemovalAfterComb",mode='r', encoding='utf-8') as file:
    text=file.read()
# Comment-bubble image used as the cloud's shape mask.
custom_mask=np.array(Image.open(r"C:\Users\PMLS\Downloads\comment.png"))
# Remove these remaining Roman-Urdu stopwords for a clearer picture.
custom_stopwords = {'nahi', 'nhi', 'ne', 'ye', 'b', 'h', 'ke', 'hu', 'a', 'hi', 'ni', 'rt'}
# Appearance-related settings.
wc = WordCloud(
    background_color= 'White',
    mask =custom_mask,
    stopwords=custom_stopwords)
wc.generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
# BUG FIX: call plt.show() — the bare attribute reference rendered nothing
# (the cell output showed the function's repr instead of the figure).
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
Let's generate this for each of our labels now¶
# Reload the combined training data, keeping only the sentence and tag
# columns (file positions 1 and 8).
df = pd.read_csv(r"C:\Users\PMLS\Downloads\Train_RushholdCombined.csv", usecols=[1, 8])
custom_mask = np.array(Image.open(r"C:\Users\PMLS\Downloads\comment.png").resize((300, 300)))  # resize the mask
# Extended Roman-Urdu stopword list for the per-label clouds.
custom_stopwords = {'nahi', 'nhi', 'ne', 'ye', 'b', 'h', 'ke', 'hu', 'a', 'hi', 'ni', 'rt', 'hai', 'ko', 'ki', 'aur', 'hy', 'se','k','ka','tu'}
# Map the source dataset's original label names to our renamed tags, used
# for the panel subtitles.
replacement_dict = {
    'Abusive/Offensive': 'Derogatory Remarks',
    'Political Hate': 'Ideological Intolerance',
    'Sexism': 'Sex-Based Hate',
    'Religious Hate': 'Faith-Based Intolerance',
    'Racism': 'Ethnic Hate',
    'Normal': 'Neutral'
}
def color_func(word, *args, **kwargs):
    """WordCloud color callback that paints every word black.

    The (word, *args, **kwargs) signature is dictated by WordCloud's
    color_func protocol; none of the inputs affect the result.
    """
    return 'black'
def generate_wordcloud_grid(df, custom_mask, custom_stopwords, replacement_dict):
    """Draw one word cloud per individual tag, arranged in a square grid.

    Expects *df* to have a 'Cleaned Final Tags' column (comma-separated tag
    strings) and a 'Sentence' column. *replacement_dict* supplies display
    names for the panel subtitles; tags not in it are shown verbatim.
    """
    tag_column = 'Cleaned Final Tags'  # use the column name as a string
    # Collect the set of individual tags across all (non-null) rows.
    all_tags = set()
    for tags_list in df[tag_column].dropna():
        tags = [tag.strip() for tag in tags_list.split(',')]
        all_tags.update(tags)
    # Smallest square grid that fits every tag.
    grid_size = int(np.ceil(np.sqrt(len(all_tags))))
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(20, 20))
    axes = axes.flatten()
    # Concatenate every sentence that carries each tag into one text blob.
    tag_sentences_dict = {}
    for tag in all_tags:
        sentences = df[df[tag_column].apply(lambda tags: tag in [t.strip() for t in tags.split(',')])]['Sentence']
        tag_sentences_dict[tag] = ' '.join(sentences)
    # NOTE(review): iterating a set makes the panel order nondeterministic
    # across runs — sort all_tags if a stable layout matters. Also, if
    # all_tags is empty, 'i' below is never bound and the cleanup loop
    # raises NameError.
    for i, tag in enumerate(all_tags):
        tag_text = tag_sentences_dict.get(tag, '')
        wc = WordCloud(
            background_color='white',
            mask=custom_mask,
            stopwords=custom_stopwords,
            max_words=100,
            contour_width=3,
            contour_color='black',
            color_func=color_func  # render every word in black
        ).generate(tag_text)
        axes[i].imshow(wc, interpolation='bilinear')
        axes[i].axis('off')
        # Subtitle uses the renamed tag when the tag is a known source label.
        subtitle = replacement_dict.get(tag, tag)
        axes[i].text(0.5, -0.1, subtitle, ha='center', va='center', transform=axes[i].transAxes, fontsize=12, fontweight='bold', color='black')
    # Hide unused axes in the last grid row.
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])
    plt.tight_layout()
    plt.show()
generate_wordcloud_grid(df, custom_mask, custom_stopwords, replacement_dict)
Word Co_Occurence Analysis¶
import nltk
from nltk import bigrams
import networkx as nx
import itertools
- Here the width of the edges (red lines) signifies the count: higher repetition = thicker line.
# Load the pre-cleaned corpus, one sentence per line.
file_path = r"C:\Users\PMLS\Downloads\RemovalAfterComb.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    sentences = file.readlines()
def extract_bigrams(text):
    """Tokenize *text* and return its adjacent word pairs as a list of tuples."""
    # BUG FIX: word_tokenize was never imported anywhere in this file, so
    # this function raised NameError at call time. Import it locally so the
    # function is self-contained regardless of the import cell's state.
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(text)
    return list(bigrams(tokens))
# Flatten per-sentence bigram lists into one corpus-wide list.
all_bigrams = [bigram for sentence in sentences for bigram in extract_bigrams(sentence)]
# Count the occurrences of each bigram.
bigram_freq = Counter(all_bigrams)
# Keep only the 20 most common bigrams.
common_bigrams = bigram_freq.most_common(20)
bigram_df = pd.DataFrame(common_bigrams, columns=['bigram', 'count'])
G = nx.Graph()
# Add one edge per bigram; edge weight = bigram frequency, which later
# controls line thickness.
for _, row in bigram_df.iterrows():
    bigram = row['bigram']
    count = row['count']
    G.add_edge(bigram[0], bigram[1], weight=count)
pos = nx.spring_layout(G, k=0.89, iterations=50)
plt.figure(figsize=(12, 7))
plt.grid(False)
nx.draw_networkx_nodes(G, pos, node_size=500, node_color='skyblue', alpha=0.7)
# Edge width scales with bigram count (thicker = more frequent).
nx.draw_networkx_edges(G, pos, width=[d['weight']*0.1 for (u, v, d) in G.edges(data=True)], alpha=0.5, edge_color="red")
nx.draw_networkx_labels(G, pos, font_size=6)  # node labels with smaller font
plt.title('Network of Top 20 Most Common Bigrams', fontsize=16)
plt.show()
# Load the previously trained Word2Vec model (skip-gram per the filename
# suffix '_sg' — but see the note below about the sg hyper-parameter).
model_Processed_dict_TFIDF_combined = Word2Vec.load("model_Processed_dict_TFIDF_combined_sg.model")
Similarity Analysis & Embeddings¶
For this we have already trained our data with Gensim's Word2Vec embedding model;
we will import it and perform a similarity analysis based on the learned word embeddings.
The model was trained with the following hyper-parameters (NOTE: `sg=0` selects CBOW, not skip-gram, even though the saved filename ends in `_sg` — confirm which variant was actually intended):
Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, sg=0, epochs=5)
# Seed words whose embedding neighborhoods we want to visualize.
keys = ['khana', 'talha', 'adalat', 'modi']
embedding_clusters = []
word_clusters = []
for word in keys:
    # Top-15 most similar words (and their vectors) for each seed word.
    embeddings = []
    words = []
    for similar_word, _ in model_Processed_dict_TFIDF_combined.wv.most_similar(word, topn=15):
        words.append(similar_word)
        embeddings.append(model_Processed_dict_TFIDF_combined.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)
# Shape: (n seed words, m=15 neighbors, k=vector size).
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
# Project every neighbor vector to 2-D with t-SNE, then restore the
# per-seed grouping via reshape.
# NOTE(review): 'n_iter' was renamed 'max_iter' in newer scikit-learn
# releases — adjust if this call warns or fails.
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D t-SNE clusters of similar words, one color per seed.

    Parameters:
        title: figure title.
        labels: seed words, one per cluster (used as legend entries).
        embedding_clusters: array of shape (n, m, 2) of projected points.
        word_clusters: list of n lists of m words to annotate the points.
        a: scatter alpha (0-1).
        filename: optional path; when given, the figure is also saved as PNG.
    """
    plt.figure(figsize=(16, 9))
    # One color per cluster, sampled evenly across the seismic colormap.
    colors = cm.seismic(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        plt.scatter(x, y, c=color, alpha=a, label=label)
        # Annotate each point with its word, slightly offset from the marker.
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)  # lower-right corner
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()
tsne_plot_similar_words('Similar words Based on Embeddings', keys, embeddings_en_2d, word_clusters, 0.7, 'similar_words.png')