In [138]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import matplotlib.cm as cm
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import seaborn as sns 
import textwrap 
import warnings
import re
warnings.filterwarnings('ignore')
In [ ]:
 

DataSet Structure¶

Sentence Cleaned Final Tags
This is hate sentence 1: "Hate tag 1"
This is hate sentence 2: "Hate tag 2, Hate tag 3"
This is hate sentence 3: "Hate tag 1, Hate tag 2, Hate tag 3"
This is hate sentence 4: "Hate tag 4, Hate tag 5"
In [150]:
# Load the labeled Roman-Hindi/Urdu hate-speech dataset.
# NOTE(review): absolute local path — consider a configurable DATA_DIR for portability.
df=pd.read_csv(r"C:\Users\PMLS\Downloads\Notebook Data\Roman_hindi_withTag.csv")

Our dataset has 6 tags that are:

  1. Derogatory Remarks
  2. Ideological Intolerance
  3. Sex-Based Hate
  4. Faith-Based Intolerance
  5. Ethnic Hate
  6. Neutral
  • plus combinations of these hate tags, since this is a multi-label (multi-hate) dataset

In [140]:
# Number of distinct raw tag strings before cleaning (31 — inflated by
# ordering/spacing inconsistencies analyzed below).
df['Cleaned Final Tags'].nunique()
Out[140]:
31
In [ ]:
 

Data Cleaning (Tags and Sentences)¶

In [141]:
# Collect the raw unique tag strings; these are reused below to build the cleaning map.
un1 = df['Cleaned Final Tags'].unique()
unique_tags = list(un1)
unique_tags_series = pd.Series(unique_tags)
un1  # display the raw values for manual inspection
Out[141]:
array(['Neutral',
       'Derogatory Remarks, Ideological Intolerance, Sex-Based Hate',
       'Ideological Intolerance, Sex-Based Hate,Derogatory Remarks',
       'Derogatory Remarks, Sex-Based Hate',
       'Derogatory Remarks, Faith-Based Intolerance',
       'Domgraphy Remarks, Sex-Based Hate', 'Derogatory Remarks',
       'Derogatory Remarks, Faith-Based Intolerance, Sex-Based Hate',
       'Derogatory Remarks, Ideological Intolerance, Faith-Based Intolerance',
       'Faith-Based Intolerance, Sex-Based Hate',
       'Faith-Based Intolerance',
       'Derogatory Remarks, Ideological Intolerance',
       'Ethnic Hate, Faith-Based Intolerance',
       'Ideological Intolerance, Faith-Based Intolerance', 'Sex-Bas',
       'Ideological Intolerance, Ethnic Hate', 'Ideological Intolerance',
       'Ideological Intolerance, Sex-Based Hate', 'Sex-Based Hate',
       'Derogatory Remarks, Ethnic Hate, Sex-Based Hate',
       'Derogatory Remarks, Ideological Intolerance, Faith-Based Intolerance, Sex-Based Hate',
       'Sex-Based Hate, Derogatory Remarks', ', Sex-Based Hate',
       'Derogatory Remarks, Ideological Intolerance, Ethnic Hate',
       'Derogatory Remarks, Ethnic Hate', 'Ethnic Hate',
       'Ideological Intolerance, Ethnic Hate, Faith-Based Intolerance',
       'Derogatory Remarks, Ethnic Hate, Faith-Based Intolerance',
       'Ethnic Hate, Sex-Based Hate',
       'Ethnic Hate, Faith-Based Intolerance, Sex-Based Hate',
       'Ideological Intolerance, Faith-Based Intolerance, Sex-Based Hate'],
      dtype=object)
  • We are going to analyze these unique tags in depth — manually, or with the help of ChatGPT for such tasks

Analysis of Inconsistencies in Tagging Process¶

1. Order and Spacing Issues¶

  • Order Issues: During the tagging process, some data entries have the same tags but are ordered inconsistently, such as:

    • Derogatory Remarks, Sex-Based Hate
    • Sex-Based Hate, Derogatory Remarks

    These should be standardized to maintain consistency.

  • Spacing Issues: Tags with inconsistent spacing are treated differently, even though they represent the same categories. Examples include:

    • Derogatory Remarks,Sex-Based Hate
    • Derogatory Remarks, Sex-Based Hate

    Proper spacing should be ensured to avoid such discrepancies.

2. General Mistakes in Tags¶

  • Incomplete or Misspelled Tags: There are instances where tags are incomplete or contain spelling mistakes. Some examples are:

    • Sex-Bas instead of Sex-Based Hate
    • , Sex-Based Hate (an extra comma at the beginning)

    These errors should be corrected to ensure the accuracy of the tagging process.

Fixing Order and Spacing Issues¶

  • All of the ordering and spacing issues are fixed by the short function below, which relies on a simple regular expression
In [142]:
def clean_and_sort_tags(tag):
    """Normalize a comma-separated tag string so equivalent tag sets compare equal.

    Trims outer whitespace, normalizes ``,`` separators to ``', '``, removes
    duplicate tags, and sorts the parts alphabetically.
    NaN values are returned unchanged (not removed).
    """
    if pd.isna(tag):
        return tag
    normalized = re.sub(r'\s*,\s*', ', ', tag.strip())
    unique_parts = {piece.strip() for piece in normalized.split(', ')}
    return ', '.join(sorted(unique_parts))

unique_tags = un1.tolist()
unique_tags_series = pd.Series(unique_tags)

# Precompute cleaned forms for each known raw value.
cleaned_tags = unique_tags_series.apply(clean_and_sort_tags)
tag_mapping = dict(zip(unique_tags, cleaned_tags))
# Apply the normalization to the whole column.
# NOTE(review): the dict lookup's fallback `clean_and_sort_tags(x)` is evaluated
# eagerly for every row (Python evaluates arguments before the .get call), so the
# mapping does not actually avoid recomputation — result is still correct.
df['Cleaned Final Tags'] = df['Cleaned Final Tags'].apply(lambda x: tag_mapping.get(x, clean_and_sort_tags(x)))
In [143]:
# Re-check distinct tag strings after order/spacing normalization: 31 -> 29.
df['Cleaned Final Tags'].nunique()
Out[143]:
29
  • Our count decreased from 31 to 29.
  • Next, we fix the remaining identified mistakes to repair the tagging.
In [144]:
# Fix the truncated tag 'Sex-Bas' -> 'Sex-Based Hate'.
df['Cleaned Final Tags'] = df['Cleaned Final Tags'].str.replace(r'\bSex-Bas\b', 'Sex-Based Hate', regex=True)
# Drop the stray leading comma in ', Sex-Based Hate'.
df['Cleaned Final Tags'] = df['Cleaned Final Tags'].str.replace(r'^,\s*Sex-Based Hate\b', 'Sex-Based Hate', regex=True)
# NOTE(review): the misspelled tag 'Domgraphy Remarks' (visible in the unique values
# above) is NOT corrected here and survives into later plots — confirm whether this
# is intentional (it is flagged as an "outlier" later in the notebook).
df['Cleaned Final Tags'].nunique()
Out[144]:
27

Sentence CleanUp¶

In [145]:
def clean_tweet(tweet):
    """Strip Twitter artifacts from one sentence.

    Removes the 'RT' retweet marker, flattens newlines/tabs to spaces, deletes
    URLs, hashtags and @-mentions, and collapses runs of whitespace.
    """
    without_rt = re.sub(r'\bRT\b', '', tweet)
    flattened = without_rt.replace('\n', ' ').replace('\t', ' ')
    # Each of these tokens runs to the next whitespace character.
    for pattern in (r'http\S+', r'#\S+', r'@\S+'):
        flattened = re.sub(pattern, '', flattened)
    # Collapse repeated spaces left behind by the removals.
    return re.sub(r'\s+', ' ', flattened).strip()
# Apply the cleaner into a NEW column so the raw 'Sentence' text is preserved.
df['Clean Sentence'] = df['Sentence'].apply(clean_tweet)

Visualizations¶

Tag Distribution Analysis¶

In [146]:
import matplotlib.pyplot as plt  # NOTE(review): redundant — already imported at the top of the notebook


# The six base tags; multi-label rows are split so each base tag is tallied once per sentence.
tags = ['Derogatory Remarks', 'Sex-Based Hate', 'Faith-Based Intolerance', 
        'Ideological Intolerance', 'Ethnic Hate', 'Neutral']

tag_counts = {tag: 0 for tag in tags}


# Split each combination string and count the individual base tags.
for tags_list in df['Cleaned Final Tags']:
    for tag in tags_list.split(','):
        tag = tag.strip()  # For removing whitespace lead/trail
        if tag in tag_counts:
            tag_counts[tag] += 1

# Order the bars by frequency, descending.
tag_counts = dict(sorted(tag_counts.items(), key=lambda item: item[1], reverse=True))

# NOTE(review): since a sentence can carry several tags, this sum counts TAG
# OCCURRENCES, not distinct sentences — the "Total Sentences" annotation below
# therefore overstates the sentence count; verify which figure is intended.
total_sentences = sum(tag_counts.values())

plt.figure(figsize=(10, 5))
bars = plt.bar(tag_counts.keys(), tag_counts.values(), color='#2F4F4F')

# Annotate each bar with its count.
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height}', ha='center', va='bottom')

max_count = max(tag_counts.values())
plt.text(len(tags) - 2, max_count * 1.1, f'Total Sentences: {total_sentences}', 
         ha='center', va='bottom', fontsize=12, color='black', fontweight='bold')

plt.xlabel('Tags')
plt.ylabel('Count')
plt.title('Count of Unique Tags in Cleaned Final Tags')
plt.xticks(rotation=45)
plt.ylim(0, max_count * 1.2)  
plt.tight_layout()
plt.show()
No description has been provided for this image

Lets Analyze the Distrubution of plane Hate Based Sentences Now¶

In [10]:
# Inspect the cleaned unique tag combinations (27 values after cleanup).
df['Cleaned Final Tags'].unique()
Out[10]:
array(['Neutral',
       'Derogatory Remarks, Ideological Intolerance, Sex-Based Hate',
       'Derogatory Remarks, Sex-Based Hate',
       'Derogatory Remarks, Faith-Based Intolerance',
       'Domgraphy Remarks, Sex-Based Hate', 'Derogatory Remarks',
       'Derogatory Remarks, Faith-Based Intolerance, Sex-Based Hate',
       'Derogatory Remarks, Faith-Based Intolerance, Ideological Intolerance',
       'Faith-Based Intolerance, Sex-Based Hate',
       'Faith-Based Intolerance',
       'Derogatory Remarks, Ideological Intolerance',
       'Ethnic Hate, Faith-Based Intolerance',
       'Faith-Based Intolerance, Ideological Intolerance',
       'Sex-Based Hate', 'Ethnic Hate, Ideological Intolerance',
       'Ideological Intolerance',
       'Ideological Intolerance, Sex-Based Hate',
       'Derogatory Remarks, Ethnic Hate, Sex-Based Hate',
       'Derogatory Remarks, Faith-Based Intolerance, Ideological Intolerance, Sex-Based Hate',
       'Derogatory Remarks, Ethnic Hate, Ideological Intolerance',
       'Derogatory Remarks, Ethnic Hate', 'Ethnic Hate',
       'Ethnic Hate, Faith-Based Intolerance, Ideological Intolerance',
       'Derogatory Remarks, Ethnic Hate, Faith-Based Intolerance',
       'Ethnic Hate, Sex-Based Hate',
       'Ethnic Hate, Faith-Based Intolerance, Sex-Based Hate',
       'Faith-Based Intolerance, Ideological Intolerance, Sex-Based Hate'],
      dtype=object)
In [11]:
# Build the list of tag combinations directly from the cleaned column instead of a
# hand-copied snapshot of an earlier .unique() output: a hard-coded list goes stale
# whenever the cleaning above changes, and any combination missing from it would be
# silently dropped from the counts.
tags = df['Cleaned Final Tags'].dropna().unique().tolist()

# Count how many sentences carry each exact tag combination.
tag_counts_individual_sets = {tag: 0 for tag in tags}

for tags_list in df['Cleaned Final Tags']:
    tags_list = tags_list.strip()  # defensive: trim stray whitespace
    if tags_list in tag_counts_individual_sets:
        tag_counts_individual_sets[tags_list] += 1

tag_counts_df = pd.DataFrame(list(tag_counts_individual_sets.items()), columns=['Label', 'Count'])

# Primary sort: frequency, descending.
tag_counts_df = tag_counts_df.sort_values(by='Count', ascending=False).reset_index(drop=True)

# Secondary key: how many individual tags make up the combination.
tag_counts_df['Label_Count'] = tag_counts_df['Label'].apply(lambda x: len(x.split(',')))

# Sort by 'Count' (primary, descending) and 'Label_Count' (secondary, ascending).
sorted_df = tag_counts_df.sort_values(by=['Count', 'Label_Count'], ascending=[False, True]).reset_index(drop=True)
In [12]:
# Exclude the dominant Neutral class so the hate combinations are visible on one scale.
tag_counts_df = tag_counts_df[tag_counts_df['Label'] != 'Neutral']
plt.style.use('ggplot')
plt.figure(figsize=(15, 8))
plt.bar(tag_counts_df['Label'], tag_counts_df['Count'], color='#DB7093', edgecolor='black')
plt.xlabel('Tags')
plt.ylabel('Count')
plt.title('Count of Unique Sets and Individual Tags in Cleaned Final Tags')
plt.xticks(rotation=90)  # full-length combination labels only fit vertically
plt.show()
No description has been provided for this image
  • Let's make this visual better and more appealing
  • We will abbreviate the naming conventions
  • Derogatory Remarks: DR
  • Sex-Based Hate: SBH
  • Faith-Based Intolerance: FBI
  • Ideological Intolerance: II
  • Ethnic Hate: EH
  • All combinations of them will follow the same pattern by combining these abbreviations.
In [13]:
# Abbreviation lookup for the six base labels.
base_labels = {
    'Derogatory Remarks': 'DR',
    'Sex-Based Hate': 'SBH',
    'Faith-Based Intolerance': 'FBI',
    'Ideological Intolerance': 'II',
    'Ethnic Hate': 'EH',
    'Neutral': 'N'
}

def generate_custom_label(tag, base_labels):
    """Abbreviate a comma-separated tag combination.

    E.g. 'Derogatory Remarks, Sex-Based Hate' -> 'DR, SBH'.
    Tags not present in base_labels are kept verbatim.
    """
    pieces = (part.strip() for part in tag.split(','))
    return ', '.join(base_labels.get(piece, piece) for piece in pieces)

# Map every tag combination to its abbreviated display form.
custom_labels = {tag: generate_custom_label(tag, base_labels) for tag in tags}
In [14]:
# Any combination missing from the abbreviation map falls back to its full name.
for label in tag_counts_df['Label'].unique():
    if label not in custom_labels:
        custom_labels[label] = label

# Drop Neutral (already excluded in the earlier cell; repeated here so this cell
# is safe to re-run on its own).
tag_counts_df = tag_counts_df[tag_counts_df['Label'] != 'Neutral']

tag_counts_df = tag_counts_df.sort_values(by='Count', ascending=False)

plt.style.use('ggplot')
plt.figure(figsize=(15, 8))
bars = plt.barh(tag_counts_df['Label'], tag_counts_df['Count'], color='#FF7F50', edgecolor='black')

# Annotate each bar with its count just beyond the bar end.
for bar in bars:
    width = bar.get_width()
    plt.text(width + 5, bar.get_y() + bar.get_height() / 2, f'{int(width)}', ha='center', va='center', fontsize=10, color='black')

plt.xlabel('Count')
plt.ylabel('Tags')
plt.title('Count of Unique Sets and Individual Tags in Cleaned Final Tags')

# Replace the long tick labels with their abbreviations (same order as the bars).
plt.yticks(ticks=range(len(tag_counts_df['Label'])), labels=[custom_labels[label] for label in tag_counts_df['Label']])


# The legend is a key for the abbreviations only; the line colors are arbitrary.
legend_elements = [
    Line2D([0], [0], color='red', lw=4, label='Derogatory Remarks: DR'),
    Line2D([0], [0], color='blue', lw=4, label='Sex-Based Hate: SBH'),
    Line2D([0], [0], color='green', lw=4, label='Faith-Based Intolerance: FBI'),
    Line2D([0], [0], color='purple', lw=4, label='Ideological Intolerance: II'),
    Line2D([0], [0], color='orange', lw=4, label='Ethnic Hate: EH'),
    Line2D([0], [0], color='gray', lw=4, label='All Combinations')
]
# FIX: legend title typo 'Abbrevated' -> 'Abbreviated'.
plt.legend(handles=legend_elements, loc='upper right', title='Abbreviated names')

plt.tight_layout()
plt.show()
No description has been provided for this image
  • Using this we can spot the outlier tag 'Domgraphy Remarks' (a misspelling of 'Derogatory Remarks') hiding in the data
  • We will add one similar visual to inspect this kind of distribution
In [15]:
# Percentage each tag combination contributes to all hate sentences.
total_hate_sentences = tag_counts_df['Count'].sum()
label_pcts = tag_counts_df['Count'] / total_hate_sentences * 100

# Prepend the 100% reference so the annotation loop below covers every row.
prop_counts = pd.concat([pd.Series([100]), label_pcts], ignore_index=True)


labels = tag_counts_df['Label']

plt.style.use('ggplot') 


fig, ax = plt.subplots(figsize=(15, 8))

# Centered "funnel" bars: each bar spans [-pct/2, +pct/2] around x = 0.
ax.barh('Total Hate Sentences', 100, color='lightgrey', edgecolor='black', left=-50)

# BUG FIX: the original zipped `labels` against the series WITH the prepended 100,
# so every label was drawn with the PREVIOUS row's percentage (the first label bar
# at 100%) and the last percentage was silently dropped. Pair each label with its
# own percentage instead.
for label, count in zip(labels, label_pcts):
    ax.barh(label, count, color='#008080', edgecolor='black', left=-count/2)

# Annotate every row (reference bar at index 0, then the labels in order).
for i, count in enumerate(prop_counts):
    ax.text(count / 2, i, f'{count:.2f}%', ha='left', va='center', fontsize=8, color='black')

ax.text(0, 'Total Hate Sentences', f'Total: {total_hate_sentences}', ha='center', va='center', fontsize=10, color='black', fontweight='bold')

ax.set_xlabel('Percentage of Total Hate Sentences')
ax.set_title('Proportional Contribution to Total Hate Sentences')

ax.grid(False)

ax.set_xlim(-50, 60)

plt.tight_layout()
plt.show()
No description has been provided for this image

Normal vs All Hate labels¶

In [16]:
# Count of purely Neutral sentences.
normal_count = sorted_df[sorted_df['Label'] == 'Neutral']['Count'].values[0]
# BUG FIX: the original excluded the label 'Normal', which does not exist in this
# data (the label is 'Neutral'), so the Neutral count was wrongly included in the
# hate total and the pie slices did not sum to the dataset size correctly.
hate_labels_count = sorted_df[sorted_df['Label'] != 'Neutral']['Count'].sum()

new_data = {
    'Label': ['Neutral', 'Hate_Labels_Comb'],
    'Count': [normal_count, hate_labels_count]
}

new_df = pd.DataFrame(new_data)
new_df

def autopct_format(pct, allvals):
    """Format a pie wedge as a percentage with the absolute count beneath it."""
    absolute = int(round(pct/100.*sum(allvals)))
    return "{:.1f}%\n({:d})".format(pct, absolute)

plt.figure(figsize=(3, 3))
plt.pie(new_df['Count'], labels=new_df['Label'], autopct=lambda pct: autopct_format(pct, new_df['Count']), 
        colors=['#1E90FF', '#FFD700'], startangle=140, wedgeprops={'edgecolor': 'black'})

plt.title("Neutral vs All Hate Labels Combined")
plt.show()
No description has been provided for this image

Next, We Will Analyze the Sentences and Their Contextual Relevance¶

Word Frequency Analysis ( WordClouds)¶

In [17]:
!pip install Pillow
Requirement already satisfied: Pillow in c:\users\pmls\anaconda3\lib\site-packages (10.2.0)
[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
In [18]:
from wordcloud import WordCloud, STOPWORDS ,ImageColorGenerator
from PIL import Image
In [19]:
# BUG FIX: join with a space — ''.join fuses the last word of each sentence with
# the first word of the next, corrupting the word frequencies the cloud is built on.
text = ' '.join(df['Clean Sentence'])
In [20]:
# Quick default word cloud of the whole cleaned corpus.
wc = WordCloud()
wc.generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')  # hide the pixel-coordinate axes around the image
plt.show()  # BUG FIX: was `plt.show` without parentheses — it only echoed the function, never rendered explicitly
Out[20]:
<function matplotlib.pyplot.show(close=None, block=None)>
No description has been provided for this image
In [23]:
# Re-read the corpus from a pre-processed text dump.
# NOTE(review): this path has no extension while the bigram cell below reads
# "RemovalAfterComb.txt" — confirm both point at the intended file.
with open(r"C:\Users\PMLS\Downloads\RemovalAfterComb",mode='r', encoding='utf-8') as file:
    text=file.read()
    
custom_mask=np.array(Image.open(r"C:\Users\PMLS\Downloads\comment.png"))
# Remove these remaining Roman-script stopwords too, for a clearer picture.
custom_stopwords = {'nahi', 'nhi', 'ne', 'ye', 'b', 'h', 'ke', 'hu', 'a', 'hi', 'ni', 'rt'}
# Appearance-related configuration.
wc = WordCloud(
    background_color= 'White',
    mask =custom_mask,
    stopwords=custom_stopwords)
wc.generate(text)



plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()  # BUG FIX: was `plt.show` without parentheses — never actually invoked
Out[23]:
<function matplotlib.pyplot.show(close=None, block=None)>
No description has been provided for this image

Let's Generate This for Each of Our Labels Now¶

In [49]:
# Reload sentences + tags (columns 1 and 8) from the combined training CSV.
df = pd.read_csv(r"C:\Users\PMLS\Downloads\Train_RushholdCombined.csv", usecols=[1, 8])
custom_mask = np.array(Image.open(r"C:\Users\PMLS\Downloads\comment.png").resize((300, 300)))  # Resize the mask
custom_stopwords = {'nahi', 'nhi', 'ne', 'ye', 'b', 'h', 'ke', 'hu', 'a', 'hi', 'ni', 'rt', 'hai', 'ko', 'ki', 'aur', 'hy', 'se','k','ka','tu'}

# Map the raw label names used in this CSV to the notebook's naming scheme.
replacement_dict = {
    'Abusive/Offensive': 'Derogatory Remarks',
    'Political Hate': 'Ideological Intolerance',
    'Sexism': 'Sex-Based Hate',
    'Religious Hate': 'Faith-Based Intolerance',
    'Racism': 'Ethnic Hate',
    'Normal': 'Neutral'
}

def color_func(word, *args, **kwargs):
    """Render every word in black, regardless of frequency."""
    return 'black'

def generate_wordcloud_grid(df, custom_mask, custom_stopwords, replacement_dict):
    """Draw one word cloud per individual tag, arranged in a near-square grid.

    A sentence contributes to a tag's cloud if it carries that tag, possibly
    among others (multi-label rows feed several clouds).
    """
    tag_column = 'Cleaned Final Tags'

    # Collect the set of individual tags across all combinations.
    all_tags = set()
    for tags_list in df[tag_column].dropna():
        tags = [tag.strip() for tag in tags_list.split(',')]
        all_tags.update(tags)

    # Smallest square grid that fits one axis per tag.
    grid_size = int(np.ceil(np.sqrt(len(all_tags))))
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(20, 20))
    axes = axes.flatten()
    
    # Gather the concatenated sentence text for each tag.
    # BUG FIX: guard against NaN rows — the original used dropna() when collecting
    # tags but NOT here, so a single missing value crashed on `.split` of a float.
    tag_sentences_dict = {}
    for tag in all_tags:
        mask = df[tag_column].apply(
            lambda tags: isinstance(tags, str) and tag in [t.strip() for t in tags.split(',')]
        )
        tag_sentences_dict[tag] = ' '.join(df[mask]['Sentence'])

    for i, tag in enumerate(all_tags):
        tag_text = tag_sentences_dict.get(tag, '')
        wc = WordCloud(
            background_color='white',
            mask=custom_mask,
            stopwords=custom_stopwords,
            max_words=100,
            contour_width=3,
            contour_color='black',
            color_func=color_func  # all-black words
        ).generate(tag_text)
        
        axes[i].imshow(wc, interpolation='bilinear')
        axes[i].axis('off')
        
        # Display name: the notebook's naming scheme if this tag has a mapping.
        subtitle = replacement_dict.get(tag, tag)
        
        axes[i].text(0.5, -0.1, subtitle, ha='center', va='center', transform=axes[i].transAxes, fontsize=12, fontweight='bold', color='black')

    # Hide unused axes in the grid.
    for j in range(i + 1, len(axes)):
        fig.delaxes(axes[j])
    
    plt.tight_layout()
    plt.show()

generate_wordcloud_grid(df, custom_mask, custom_stopwords, replacement_dict)
No description has been provided for this image

Word Co_Occurence Analysis¶

In [50]:
import nltk
from nltk import bigrams
import networkx as nx
import itertools
  • Here the width of an edge (red line) signifies the count — higher repetition = thicker line
In [92]:
# BUG FIX: `word_tokenize` and `Counter` are used below but were never imported
# anywhere in this notebook — on a fresh kernel (Restart & Run All) this cell
# raises NameError. Import them locally here.
from collections import Counter
from nltk.tokenize import word_tokenize

file_path = r"C:\Users\PMLS\Downloads\RemovalAfterComb.txt"

with open(file_path, 'r', encoding='utf-8') as file:
    sentences = file.readlines()

def extract_bigrams(text):
    """Tokenize one sentence and return its list of adjacent word pairs."""
    tokens = word_tokenize(text)
    return list(bigrams(tokens))

all_bigrams = [bigram for sentence in sentences for bigram in extract_bigrams(sentence)]

# Count the occurrences of each bigram
bigram_freq = Counter(all_bigrams)

# Get the 20 most common bigrams
common_bigrams = bigram_freq.most_common(20)

bigram_df = pd.DataFrame(common_bigrams, columns=['bigram', 'count'])

G = nx.Graph()

# One edge per bigram; edge weight = bigram frequency.
for _, row in bigram_df.iterrows():
    bigram = row['bigram']
    count = row['count']
    G.add_edge(bigram[0], bigram[1], weight=count)

pos = nx.spring_layout(G, k=0.89, iterations=50)

plt.figure(figsize=(12, 7))
plt.grid(False)

nx.draw_networkx_nodes(G, pos, node_size=500, node_color='skyblue', alpha=0.7)
# Edge width scales with bigram frequency: thicker = more common.
nx.draw_networkx_edges(G, pos, width=[d['weight']*0.1 for (u, v, d) in G.edges(data=True)], alpha=0.5, edge_color="red")
nx.draw_networkx_labels(G, pos, font_size=6)  # node labels with a small font

plt.title('Network of Top 20 Most Common Bigrams', fontsize=16)
plt.show()
No description has been provided for this image
In [ ]:
 
In [97]:
# Load the pre-trained gensim Word2Vec model (trained separately; see hyperparameters below).
model_Processed_dict_TFIDF_combined = Word2Vec.load("model_Processed_dict_TFIDF_combined_sg.model")

Similarity Analysis & Embeddings¶

For this, we have already trained our data with gensim's Word2Vec embedding model.

  • We will import it and perform a similarity analysis based on common word embeddings.

  • The model was trained with the following hyperparameters (note: the text originally said skip-gram and the model filename ends in "_sg", but `sg=0` selects CBOW — verify which architecture was actually used):

Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, sg=0, epochs=5)

In [130]:
# Seed words used to probe the embedding space.
keys = ['khana', 'talha', 'adalat', 'modi']

# For each seed word, fetch its 15 nearest neighbours and their embedding vectors.
embedding_clusters = []
word_clusters = []
for word in keys:
    embeddings = []
    words = []
    for similar_word, _ in model_Processed_dict_TFIDF_combined.wv.most_similar(word, topn=15):
        words.append(similar_word)
        embeddings.append(model_Processed_dict_TFIDF_combined.wv[similar_word])
    embedding_clusters.append(embeddings)
    word_clusters.append(words)

# Flatten (n_seeds, 15, dim) -> (n_seeds*15, dim) for t-SNE, then restore the grouping in 2-D.
embedding_clusters = np.array(embedding_clusters)
n, m, k = embedding_clusters.shape
# NOTE(review): `n_iter` is deprecated in newer scikit-learn (renamed `max_iter`) —
# pin the scikit-learn version or update this argument.
tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot each seed word's neighbour cluster in the 2-D t-SNE space.

    labels: the seed words (one colour per seed, sampled from cm.seismic).
    embedding_clusters: array of shape (n_seeds, n_neighbours, 2).
    word_clusters: neighbour words, parallel to embedding_clusters.
    a: scatter alpha.
    filename: optional path to also save the figure as PNG.
    """
    plt.figure(figsize=(16, 9))
    colors = cm.seismic(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        plt.scatter(x, y, c=color, alpha=a, label=label)
        # Annotate every point with its word, slightly offset from the marker.
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()

tsne_plot_similar_words('Similar words Based on Embeddings', keys, embeddings_en_2d, word_clusters, 0.7, 'similar_words.png')
No description has been provided for this image
In [ ]: