import re
import textwrap
import warnings
from collections import Counter

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from matplotlib.lines import Line2D
from nltk.tokenize import word_tokenize
from sklearn.manifold import TSNE

warnings.filterwarnings('ignore')
DataSet Structure¶
| Sentence | Cleaned Final Tags |
|---|---|
| This is hate sentence 1: | "Hate tag 1" |
| This is hate sentence 2: | "Hate tag 2, Hate tag 3" |
| This is hate sentence 3: | "Hate tag 1, Hate tag 2, Hate tag 3" |
| This is hate sentence 4: | "Hate tag 4, Hate tag 5" |
# Load the tagged Roman Hindi/Urdu dataset; the columns used below are
# 'Sentence' and 'Cleaned Final Tags'.
df=pd.read_csv(r"C:\Users\PMLS\Downloads\Notebook Data\Roman_hindi_withTag.csv")
Our dataset has 6 tags that are:
- Derogatory Remarks
- Ideological Intolerance
- Sex-Based Hate
- Faith-Based Intolerance
- Ethnic Hate
- Neutral
- and the Combinations of all the hate tags , as this is Multi hate
`
df['Cleaned Final Tags'].nunique()
31
Data Cleaning (Tags and Sentences)¶
# Capture the raw (pre-cleaning) unique tag strings so we can later build
# an original-value -> cleaned-value mapping.
un1=df['Cleaned Final Tags'].unique()
unique_tags = un1.tolist()
unique_tags_series = pd.Series(unique_tags)
un1  # these will be used later for the tag-cleaning mapping
array(['Neutral',
'Derogatory Remarks, Ideological Intolerance, Sex-Based Hate',
'Ideological Intolerance, Sex-Based Hate,Derogatory Remarks',
'Derogatory Remarks, Sex-Based Hate',
'Derogatory Remarks, Faith-Based Intolerance',
'Domgraphy Remarks, Sex-Based Hate', 'Derogatory Remarks',
'Derogatory Remarks, Faith-Based Intolerance, Sex-Based Hate',
'Derogatory Remarks, Ideological Intolerance, Faith-Based Intolerance',
'Faith-Based Intolerance, Sex-Based Hate',
'Faith-Based Intolerance',
'Derogatory Remarks, Ideological Intolerance',
'Ethnic Hate, Faith-Based Intolerance',
'Ideological Intolerance, Faith-Based Intolerance', 'Sex-Bas',
'Ideological Intolerance, Ethnic Hate', 'Ideological Intolerance',
'Ideological Intolerance, Sex-Based Hate', 'Sex-Based Hate',
'Derogatory Remarks, Ethnic Hate, Sex-Based Hate',
'Derogatory Remarks, Ideological Intolerance, Faith-Based Intolerance, Sex-Based Hate',
'Sex-Based Hate, Derogatory Remarks', ', Sex-Based Hate',
'Derogatory Remarks, Ideological Intolerance, Ethnic Hate',
'Derogatory Remarks, Ethnic Hate', 'Ethnic Hate',
'Ideological Intolerance, Ethnic Hate, Faith-Based Intolerance',
'Derogatory Remarks, Ethnic Hate, Faith-Based Intolerance',
'Ethnic Hate, Sex-Based Hate',
'Ethnic Hate, Faith-Based Intolerance, Sex-Based Hate',
'Ideological Intolerance, Faith-Based Intolerance, Sex-Based Hate'],
dtype=object)
- We are going to analyze these unique tags in depth — manually, or with the help of ChatGPT, which I usually use for such tasks.
Analysis of Inconsistencies in Tagging Process¶
1. Order and Spacing Issues¶
Order Issues: During the tagging process, some data entries have the same tags but are ordered inconsistently, such as:
`Derogatory Remarks, Sex-Based Hate` vs `Sex-Based Hate, Derogatory Remarks`
These should be standardized to maintain consistency.
Spacing Issues: Tags with inconsistent spacing are treated differently, even though they represent the same categories. Examples include:
`Derogatory Remarks,Sex-Based Hate` vs `Derogatory Remarks, Sex-Based Hate`
Proper spacing should be ensured to avoid such discrepancies.
2. General Mistakes in Tags¶
Incomplete or Misspelled Tags: There are instances where tags are incomplete or contain spelling mistakes. Some examples are:
`Sex-Bas` instead of `Sex-Based Hate`, and `, Sex-Based Hate` (an extra comma at the beginning)
These errors should be corrected to ensure the accuracy of the tagging process.
Fixing Order and Spacing Issues¶
- All of our order and spacing issues will be fixed by the simple few-line function below — essentially a single regular expression plus a sort.
def clean_and_sort_tags(tag):
    """Normalize a comma-separated multi-tag string.

    Trims whitespace, unifies comma spacing, removes duplicate tags and
    sorts them alphabetically so order/spacing variants map to one
    canonical form.

    NaN values pass through unchanged so pandas missing-data handling
    is preserved.

    Empty fragments are dropped, which also fixes the ', Sex-Based Hate'
    variant (extra leading comma) at the source instead of needing a
    separate regex pass afterwards.
    """
    if pd.isna(tag):
        return tag  # NaN is not removed, just returned untouched
    # Split on commas, strip each piece, discard empties, dedupe via set.
    parts = {part.strip() for part in tag.split(',') if part.strip()}
    return ', '.join(sorted(parts))
# Rebuild the tag list from the raw uniques and clean each one.
unique_tags = un1.tolist()
unique_tags_series = pd.Series(unique_tags)
cleaned_tags = unique_tags_series.apply(clean_and_sort_tags)
# Map each original (dirty) tag string to its cleaned, sorted form.
tag_mapping = dict(zip(unique_tags, cleaned_tags))
# Apply the mapping to the 'Cleaned Final Tags' column; values not seen in
# the mapping are cleaned on the fly as a fallback.
df['Cleaned Final Tags'] = df['Cleaned Final Tags'].apply(lambda x: tag_mapping.get(x, clean_and_sort_tags(x)))
df['Cleaned Final Tags'].nunique()
29
- Our count decreased from 31 to 29.
- Next, we will fix the identified spelling/truncation mistakes in the tags themselves.
# Fix the truncated tag 'Sex-Bas' -> 'Sex-Based Hate'.
df['Cleaned Final Tags'] = df['Cleaned Final Tags'].str.replace(r'\bSex-Bas\b', 'Sex-Based Hate', regex=True)
# Drop the stray leading comma in ', Sex-Based Hate'.
df['Cleaned Final Tags'] = df['Cleaned Final Tags'].str.replace(r'^,\s*Sex-Based Hate\b', 'Sex-Based Hate', regex=True)
df['Cleaned Final Tags'].nunique()
27
Sentence CleanUp¶
def clean_tweet(tweet):
    """Strip tweet noise: retweet markers, newlines/tabs, URLs, hashtags
    and @-mentions, then collapse any leftover runs of whitespace."""
    # Drop the standalone retweet marker first.
    tweet = re.sub(r'\bRT\b', '', tweet)
    # Normalize newlines and tabs to single spaces.
    tweet = tweet.replace('\n', ' ').replace('\t', ' ')
    # Remove URLs, hashtags and mentions, in that order.
    for noise_pattern in (r'http\S+', r'#\S+', r'@\S+'):
        tweet = re.sub(noise_pattern, '', tweet)
    # Collapse whitespace left behind by the removals and trim the ends.
    return re.sub(r'\s+', ' ', tweet).strip()
# Apply the tweet cleaner into a new column, keeping the raw sentence intact.
df['Clean Sentence'] = df['Sentence'].apply(clean_tweet)
Visualizations¶
Tag Distribution Analysis¶
import matplotlib.pyplot as plt
# The six base tags we report individual counts for.
tags = ['Derogatory Remarks', 'Sex-Based Hate', 'Faith-Based Intolerance',
        'Ideological Intolerance', 'Ethnic Hate', 'Neutral']
tag_counts = {tag: 0 for tag in tags}
# A multi-label row increments every base tag it contains.
for tags_list in df['Cleaned Final Tags']:
    for tag in tags_list.split(','):
        tag = tag.strip()  # remove leading/trailing whitespace
        if tag in tag_counts:
            tag_counts[tag] += 1
# Order bars by descending count.
tag_counts = dict(sorted(tag_counts.items(), key=lambda item: item[1], reverse=True))
# NOTE(review): this is the sum of tag occurrences, not distinct sentences —
# a multi-label sentence is counted once per tag, so the 'Total Sentences'
# annotation below overstates the true sentence count. Confirm intent.
total_sentences = sum(tag_counts.values())
plt.figure(figsize=(10, 5))
bars = plt.bar(tag_counts.keys(), tag_counts.values(), color='#2F4F4F')
# Annotate each bar with its count.
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height}', ha='center', va='bottom')
max_count = max(tag_counts.values())
plt.text(len(tags) - 2, max_count * 1.1, f'Total Sentences: {total_sentences}',
         ha='center', va='bottom', fontsize=12, color='black', fontweight='bold')
plt.xlabel('Tags')
plt.ylabel('Count')
plt.title('Count of Unique Tags in Cleaned Final Tags')
plt.xticks(rotation=45)
plt.ylim(0, max_count * 1.2)  # headroom for the total annotation
plt.tight_layout()
plt.show()
Let's analyze the distribution of plain hate-based sentences now¶
df['Cleaned Final Tags'].unique()
array(['Neutral',
'Derogatory Remarks, Ideological Intolerance, Sex-Based Hate',
'Derogatory Remarks, Sex-Based Hate',
'Derogatory Remarks, Faith-Based Intolerance',
'Domgraphy Remarks, Sex-Based Hate', 'Derogatory Remarks',
'Derogatory Remarks, Faith-Based Intolerance, Sex-Based Hate',
'Derogatory Remarks, Faith-Based Intolerance, Ideological Intolerance',
'Faith-Based Intolerance, Sex-Based Hate',
'Faith-Based Intolerance',
'Derogatory Remarks, Ideological Intolerance',
'Ethnic Hate, Faith-Based Intolerance',
'Faith-Based Intolerance, Ideological Intolerance',
'Sex-Based Hate', 'Ethnic Hate, Ideological Intolerance',
'Ideological Intolerance',
'Ideological Intolerance, Sex-Based Hate',
'Derogatory Remarks, Ethnic Hate, Sex-Based Hate',
'Derogatory Remarks, Faith-Based Intolerance, Ideological Intolerance, Sex-Based Hate',
'Derogatory Remarks, Ethnic Hate, Ideological Intolerance',
'Derogatory Remarks, Ethnic Hate', 'Ethnic Hate',
'Ethnic Hate, Faith-Based Intolerance, Ideological Intolerance',
'Derogatory Remarks, Ethnic Hate, Faith-Based Intolerance',
'Ethnic Hate, Sex-Based Hate',
'Ethnic Hate, Faith-Based Intolerance, Sex-Based Hate',
'Faith-Based Intolerance, Ideological Intolerance, Sex-Based Hate'],
dtype=object)
# The full set of unique tag combinations, copied from the unique() output
# above. NOTE(review): 'Domgraphy Remarks, Sex-Based Hate' looks like a
# misspelling of 'Derogatory Remarks' that survived cleaning — verify.
tags = ['Neutral',
        'Derogatory Remarks, Ideological Intolerance, Sex-Based Hate',
        'Derogatory Remarks, Sex-Based Hate',
        'Derogatory Remarks, Faith-Based Intolerance',
        'Domgraphy Remarks, Sex-Based Hate', 'Derogatory Remarks',
        'Derogatory Remarks, Faith-Based Intolerance, Sex-Based Hate',
        'Derogatory Remarks, Faith-Based Intolerance, Ideological Intolerance',
        'Faith-Based Intolerance, Sex-Based Hate',
        'Faith-Based Intolerance',
        'Derogatory Remarks, Ideological Intolerance',
        'Ethnic Hate, Faith-Based Intolerance',
        'Faith-Based Intolerance, Ideological Intolerance',
        'Sex-Based Hate', 'Ethnic Hate, Ideological Intolerance',
        'Ideological Intolerance',
        'Ideological Intolerance, Sex-Based Hate',
        'Derogatory Remarks, Ethnic Hate, Sex-Based Hate',
        'Derogatory Remarks, Faith-Based Intolerance, Ideological Intolerance, Sex-Based Hate',
        'Derogatory Remarks, Ethnic Hate, Ideological Intolerance',
        'Derogatory Remarks, Ethnic Hate', 'Ethnic Hate',
        'Ethnic Hate, Faith-Based Intolerance, Ideological Intolerance',
        'Derogatory Remarks, Ethnic Hate, Faith-Based Intolerance',
        'Ethnic Hate, Sex-Based Hate',
        'Ethnic Hate, Faith-Based Intolerance, Sex-Based Hate',
        'Faith-Based Intolerance, Ideological Intolerance, Sex-Based Hate']
tag_counts_individual_sets = {tag: 0 for tag in tags}
# Count exact occurrences of each full combination (no splitting here —
# 'A, B' is counted as one set, not as A and B separately).
for tags_list in df['Cleaned Final Tags']:
    tags_list = tags_list.strip()  # remove leading/trailing spaces
    if tags_list in tag_counts_individual_sets:
        tag_counts_individual_sets[tags_list] += 1
tag_counts_df = pd.DataFrame(list(tag_counts_individual_sets.items()), columns=['Label', 'Count'])
tag_counts_df=tag_counts_df.sort_values(by='Count', ascending=False).reset_index(drop=True)
# Secondary sort key: number of tags in each combination.
tag_counts_df['Label_Count'] = tag_counts_df['Label'].apply(lambda x: len(x.split(',')))
# Sort by 'Count' (primary, descending) and 'Label_Count' (secondary,
# ascending). sorted_df is consumed later by the Neutral-vs-hate cell.
sorted_df = tag_counts_df.sort_values(by=['Count', 'Label_Count'], ascending=[False, True]).reset_index(drop=True)
# Neutral dominates, so drop it from the hate-only visual.
tag_counts_df = tag_counts_df[tag_counts_df['Label'] != 'Neutral']
plt.style.use('ggplot')
plt.figure(figsize=(15, 8))
plt.bar(tag_counts_df['Label'], tag_counts_df['Count'], color='#DB7093', edgecolor='black')
plt.xlabel('Tags')
plt.ylabel('Count')
plt.title('Count of Unique Sets and Individual Tags in Cleaned Final Tags')
plt.xticks(rotation=90)
plt.show()
- Let's try to make this visual better and more appealing.
- We will abbreviate the naming conventions:
- Derogatory Remarks: DR
- Sex-Based Hate: SBH
- Faith-Based Intolerance: FBI
- Ideological Intolerance: II
- Ethnic Hate: EH
- All combinations of them will follow the same pattern by combining these abbreviations.
# Abbreviations for the base labels; tag combinations reuse these short forms.
base_labels = {
    'Derogatory Remarks': 'DR',
    'Sex-Based Hate': 'SBH',
    'Faith-Based Intolerance': 'FBI',
    'Ideological Intolerance': 'II',
    'Ethnic Hate': 'EH',
    'Neutral': 'N'
}
def generate_custom_label(tag, base_labels):
    """Translate a comma-separated tag string into its abbreviated form.

    Each individual tag is looked up in *base_labels*; unknown tags fall
    back to their own (stripped) text unchanged.
    """
    short_forms = []
    for piece in tag.split(','):
        name = piece.strip()
        short_forms.append(base_labels.get(name, name))
    return ', '.join(short_forms)
# Precompute the abbreviated label for every known tag combination.
custom_labels = {tag: generate_custom_label(tag, base_labels) for tag in tags}
# Any label not covered above keeps its original text as-is.
for label in tag_counts_df['Label'].unique():
    if label not in custom_labels:
        custom_labels[label] = label
tag_counts_df = tag_counts_df[tag_counts_df['Label'] != 'Neutral']
tag_counts_df = tag_counts_df.sort_values(by='Count', ascending=False)
plt.style.use('ggplot')
plt.figure(figsize=(15, 8))
bars = plt.barh(tag_counts_df['Label'], tag_counts_df['Count'], color='#FF7F50', edgecolor='black')
# Annotate each horizontal bar with its count, just past the bar end.
for bar in bars:
    width = bar.get_width()
    plt.text(width + 5, bar.get_y() + bar.get_height() / 2, f'{int(width)}', ha='center', va='center', fontsize=10, color='black')
plt.xlabel('Count')
plt.ylabel('Tags')
plt.title('Count of Unique Sets and Individual Tags in Cleaned Final Tags')
# Replace the y tick text with the abbreviated labels.
plt.yticks(ticks=range(len(tag_counts_df['Label'])), labels=[custom_labels[label] for label in tag_counts_df['Label']])
# Legend entries are a key for the abbreviations; the colors are decorative.
legend_elements = [
    Line2D([0], [0], color='red', lw=4, label='Derogatory Remarks: DR'),
    Line2D([0], [0], color='blue', lw=4, label='Sex-Based Hate: SBH'),
    Line2D([0], [0], color='green', lw=4, label='Faith-Based Intolerance: FBI'),
    Line2D([0], [0], color='purple', lw=4, label='Ideological Intolerance: II'),
    Line2D([0], [0], color='orange', lw=4, label='Ethnic Hate: EH'),
    Line2D([0], [0], color='gray', lw=4, label='All Combinations')
]
plt.legend(handles=legend_elements, loc='upper right', title='Abbrevated names')
plt.tight_layout()
plt.show()
- Using this we can spot the outlier `Domgraphy Remarks` (a misspelling of `Derogatory Remarks`) hiding in the data.
- We will add one similar visual to see this kind of distribution.
# Funnel-style visual: each hate label's share of all hate sentences,
# drawn as centered bars under a full-width 'total' bar.
total_hate_sentences = tag_counts_df['Count'].sum()
prop_counts = tag_counts_df['Count'] / total_hate_sentences * 100
# Prepend a 100% entry for the 'Total Hate Sentences' bar.
# NOTE(review): after this concat, prop_counts has one more element than
# labels, so zip(labels, prop_counts) below pairs the first hate label
# with the 100% entry — the bar widths appear shifted by one relative to
# their labels. Confirm whether this offset is intended.
prop_counts = pd.concat([pd.Series([100]), prop_counts], ignore_index=True)
labels = tag_counts_df['Label']
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(15, 8))
# Grey backdrop bar representing 100% of hate sentences, centered on x=0.
ax.barh('Total Hate Sentences', 100, color='lightgrey', edgecolor='black', left=-50)
# Each label bar is centered by offsetting its left edge by half its width.
for i, (label, count) in enumerate(zip(labels, prop_counts)):
    ax.barh(label, count, color='#008080', edgecolor='black', left=-count/2)
# Percentage annotations, one per bar row.
for i, count in enumerate(prop_counts):
    ax.text(count / 2, i, f'{count:.2f}%', ha='left', va='center', fontsize=8, color='black')
ax.text(0, 'Total Hate Sentences', f'Total: {total_hate_sentences}', ha='center', va='center', fontsize=10, color='black', fontweight='bold')
ax.set_xlabel('Percentage of Total Hate Sentences')
ax.set_title('Proportional Contribution to Total Hate Sentences')
ax.grid(False)
ax.set_xlim(-50, 60)
plt.tight_layout()
plt.show()
Normal vs All Hate labels¶
# Compare Neutral against the combined count of every hate label.
normal_count = sorted_df[sorted_df['Label'] == 'Neutral']['Count'].values[0]
# BUG FIX: this filter previously excluded 'Normal' — a label that does not
# exist in sorted_df (it was renamed to 'Neutral' in this dataset) — so the
# hate total silently included the Neutral rows as well.
hate_labels_count = sorted_df[sorted_df['Label'] != 'Neutral']['Count'].sum()
new_data = {
    'Label': ['Neutral', 'Hate_Labels_Comb'],
    'Count': [normal_count, hate_labels_count]
}
new_df = pd.DataFrame(new_data)
new_df
def autopct_format(pct, allvals):
    """Return a two-line pie-wedge label: 'XX.X%' over '(count)'.

    Recovers the absolute count from the wedge percentage and the full
    list of values that matplotlib passed in.
    """
    absolute = int(round(pct/100.*sum(allvals)))
    return f"{pct:.1f}%\n({absolute:d})"
# Pie chart of Neutral vs all hate labels combined, each wedge annotated
# with its percentage and absolute count.
plt.figure(figsize=(3, 3))
plt.pie(new_df['Count'], labels=new_df['Label'], autopct=lambda pct: autopct_format(pct, new_df['Count']),
        colors=['#1E90FF', '#FFD700'], startangle=140, wedgeprops={'edgecolor': 'black'})
plt.title("Neutral vs All Hate Labels Combined")
plt.show()
Next, We Will Analyze the Sentences and Their Contextual Relevance¶
Word Frequency Analysis ( WordClouds)¶
!pip install Pillow
Requirement already satisfied: Pillow in c:\users\pmls\anaconda3\lib\site-packages (10.2.0)
[notice] A new release of pip is available: 24.1.2 -> 24.2 [notice] To update, run: python.exe -m pip install --upgrade pip
from wordcloud import WordCloud, STOPWORDS ,ImageColorGenerator
from PIL import Image
# BUG FIX: join with spaces — ''.join glued the last word of each sentence
# to the first word of the next, corrupting the word frequencies the cloud
# is built from.
text = ' '.join(df['Clean Sentence'])
wc = WordCloud()
wc.generate(text)
plt.imshow(wc, interpolation='bilinear')
# BUG FIX: plt.show was referenced but never called (the cell output showed
# the function's repr instead of rendering the figure).
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Load the pre-cleaned corpus text for the masked word cloud.
with open(r"C:\Users\PMLS\Downloads\RemovalAfterComb",mode='r', encoding='utf-8') as file:
    text=file.read()
# Comment-bubble image used as the cloud's shape mask.
custom_mask=np.array(Image.open(r"C:\Users\PMLS\Downloads\comment.png"))
# Remove these remaining Roman-Urdu stopwords for a clearer picture.
custom_stopwords = {'nahi', 'nhi', 'ne', 'ye', 'b', 'h', 'ke', 'hu', 'a', 'hi', 'ni', 'rt'}
# Appearance-related settings.
wc = WordCloud(
    background_color= 'White',
    mask =custom_mask,
    stopwords=custom_stopwords)
wc.generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
# BUG FIX: call plt.show() — the bare attribute reference rendered nothing
# (the cell output showed the function's repr instead of the figure).
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
Let's generate this for each of our labels now¶
# Reload the combined training data, keeping only the sentence and tag
# columns (file positions 1 and 8).
df = pd.read_csv(r"C:\Users\PMLS\Downloads\Train_RushholdCombined.csv", usecols=[1, 8])
custom_mask = np.array(Image.open(r"C:\Users\PMLS\Downloads\comment.png").resize((300, 300)))  # resize the mask
# Extended Roman-Urdu stopword list for the per-label clouds.
custom_stopwords = {'nahi', 'nhi', 'ne', 'ye', 'b', 'h', 'ke', 'hu', 'a', 'hi', 'ni', 'rt', 'hai', 'ko', 'ki', 'aur', 'hy', 'se','k','ka','tu'}
# Map the source dataset's original label names to our renamed tags, used
# for the panel subtitles.
replacement_dict = {
    'Abusive/Offensive': 'Derogatory Remarks',
    'Political Hate': 'Ideological Intolerance',
    'Sexism': 'Sex-Based Hate',
    'Religious Hate': 'Faith-Based Intolerance',
    'Racism': 'Ethnic Hate',
    'Normal': 'Neutral'
}
def color_func(word, *args, **kwargs):
    """WordCloud color callback that paints every word black.

    The (word, *args, **kwargs) signature is dictated by WordCloud's
    color_func protocol; none of the inputs affect the result.
    """
    return 'black'
def generate_wordcloud_grid(df, custom_mask, custom_stopwords, replacement_dict):
    """Draw one word cloud per individual tag, arranged in a square grid.

    Expects *df* to have a 'Cleaned Final Tags' column (comma-separated tag
    strings) and a 'Sentence' column. *replacement_dict* supplies display
    names for the panel subtitles; tags not in it are shown verbatim.
    """
    tag_column = 'Cleaned Final Tags'  # use the column name as a string
    # Collect the set of individual tags across all (non-null) rows.
    all_tags = set()
    for tags_list in df[tag_column].dropna():
        tags = [tag.strip() for tag in tags_list.split(',')]
        all_tags.update(tags)
    # Smallest square grid that fits every tag.
    grid_size = int(np.ceil(np.sqrt(len(all_tags))))
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(20, 20))
    axes = axes.flatten()
    # Concatenate every sentence that carries each tag into one text blob.
    tag_sentences_dict = {}
    for tag in all_tags:
        sentences = df[df[tag_column].apply(lambda tags: tag in [t.strip() for t in tags.split(',')])]['Sentence']
        tag_sentences_dict[tag] = ' '.join(sentences)
    # NOTE(review): iterating a set makes the panel order nondeterministic
    # across runs — sort all_tags if a stable layout matters. Also, if
    # all_tags is empty, 'i' below is never bound and the cleanup loop
    # raises NameError.
    for i, tag in enumerate(all_tags):
        tag_text = tag_sentences_dict.get(tag, '')
        wc = WordCloud(
            background_color='white',
            mask=custom_mask,
            stopwords=custom_stopwords,
            max_words=100,
            contour_width=3,
            contour_color='black',
            color_func=color_func  # render every word in black
        ).generate(tag_text)
        axes[i].imshow(wc, interpolation='bilinear')
        axes[i].axis('off')
        # Subtitle uses the renamed tag when the tag is a known source label.
        subtitle = replacement_dict.get(tag, tag)
        axes[i].text(0.5, -0.1, subtitle, ha='center', va='center', transform=axes[i].transAxes, fontsize=12, fontweight='bold', color='black')
    # Hide unused axes in the last grid row.
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])
    plt.tight_layout()
    plt.show()
generate_wordcloud_grid(df, custom_mask, custom_stopwords, replacement_dict)
Word Co_Occurence Analysis¶
import nltk
from nltk import bigrams
import networkx as nx
import itertools
- Here the width of the edges (red lines) signifies the count: higher repetition = thicker line.
# Load the pre-cleaned corpus, one sentence per line.
file_path = r"C:\Users\PMLS\Downloads\RemovalAfterComb.txt"
with open(file_path, 'r', encoding='utf-8') as file:
    sentences = file.readlines()
def extract_bigrams(text):
    """Tokenize *text* and return its adjacent word pairs as a list of tuples."""
    # BUG FIX: word_tokenize was never imported anywhere in this file, so
    # this function raised NameError at call time. Import it locally so the
    # function is self-contained regardless of the import cell's state.
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(text)
    return list(bigrams(tokens))
# Flatten per-sentence bigram lists into one corpus-wide list.
all_bigrams = [bigram for sentence in sentences for bigram in extract_bigrams(sentence)]
# Count the occurrences of each bigram.
bigram_freq = Counter(all_bigrams)
# Keep only the 20 most common bigrams.
common_bigrams = bigram_freq.most_common(20)
bigram_df = pd.DataFrame(common_bigrams, columns=['bigram', 'count'])
G = nx.Graph()
# Add one edge per bigram; edge weight = bigram frequency, which later
# controls line thickness.
for _, row in bigram_df.iterrows():
    bigram = row['bigram']
    count = row['count']
    G.add_edge(bigram[0], bigram[1], weight=count)
pos = nx.spring_layout(G, k=0.89, iterations=50)
plt.figure(figsize=(12, 7))
plt.grid(False)
nx.draw_networkx_nodes(G, pos, node_size=500, node_color='skyblue', alpha=0.7)
# Edge width scales with bigram count (thicker = more frequent).
nx.draw_networkx_edges(G, pos, width=[d['weight']*0.1 for (u, v, d) in G.edges(data=True)], alpha=0.5, edge_color="red")
nx.draw_networkx_labels(G, pos, font_size=6)  # node labels with smaller font
plt.title('Network of Top 20 Most Common Bigrams', fontsize=16)
plt.show()
# Load the previously trained Word2Vec model (skip-gram per the filename
# suffix '_sg' — but see the note below about the sg hyper-parameter).
model_Processed_dict_TFIDF_combined = Word2Vec.load("model_Processed_dict_TFIDF_combined_sg.model")
Similarity Analysis & Embeddings¶
For this we have already trained our data with Gensim's Word2Vec embedding model;
we will import it and perform a similarity analysis based on the learned word embeddings.
The model was trained with the following hyper-parameters (NOTE: `sg=0` selects CBOW, not skip-gram, even though the saved filename ends in `_sg` — confirm which variant was actually intended):
Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, sg=0, epochs=5)
# Seed words whose embedding neighborhoods we want to visualize.
keys = ['khana', 'talha', 'adalat', 'modi']
embedding_clusters = []
word_clusters = []
for word in keys:
    # Top-15 most similar words (and their vectors) for each seed word.
    embeddings = []
    words = []
    for similar_word, _ in model_Processed_dict_TFIDF_combined.wv.most_similar(word, topn=15):
        words.append(similar_word)
        embeddings.append(model_Processed_dict_TFIDF_combined.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)
# Shape: (n seed words, m=15 neighbors, k=vector size).
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
# Project every neighbor vector to 2-D with t-SNE, then restore the
# per-seed grouping via reshape.
# NOTE(review): 'n_iter' was renamed 'max_iter' in newer scikit-learn
# releases — adjust if this call warns or fails.
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)
def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D t-SNE clusters of similar words, one color per seed.

    Parameters:
        title: figure title.
        labels: seed words, one per cluster (used as legend entries).
        embedding_clusters: array of shape (n, m, 2) of projected points.
        word_clusters: list of n lists of m words to annotate the points.
        a: scatter alpha (0-1).
        filename: optional path; when given, the figure is also saved as PNG.
    """
    plt.figure(figsize=(16, 9))
    # One color per cluster, sampled evenly across the seismic colormap.
    colors = cm.seismic(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        plt.scatter(x, y, c=color, alpha=a, label=label)
        # Annotate each point with its word, slightly offset from the marker.
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)  # lower-right corner
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()
tsne_plot_similar_words('Similar words Based on Embeddings', keys, embeddings_en_2d, word_clusters, 0.7, 'similar_words.png')