As part of my Undergraduate Research Apprenticeship at UC Berkeley, I contributed to a group research initiative led by Jonathan Old. Our team examined how mandated political representation (reservations) in India influences low-level subnational conflict.
Using over 195,000 text-based conflict event records from the Armed Conflict Location & Event Data Project (ACLED) and the South Asia Terrorism Portal (SATP), I applied natural language processing techniques to classify conflict type, analyze sentiment, and identify patterns in political framing. This work involved developing a BERT-based classifier to categorize events and a preprocessing pipeline to clean and tokenize conflict notes.
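The sentiment step is not shown in the scripts below; a minimal sketch of how it could look, assuming TextBlob polarity scores over an ACLED-style 'notes' column (score_sentiment is an illustrative name, not project code):

from textblob import TextBlob
import pandas as pd

def score_sentiment(df, text_col='notes'):
    # Attach a TextBlob polarity score in [-1, 1] to each conflict note
    df = df.copy()
    df['polarity'] = df[text_col].fillna('').apply(
        lambda text: TextBlob(text).sentiment.polarity
    )
    return df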
We developed a formal model of targeted public goods provision, which predicts reduced conflict when reservations benefit large, economically disadvantaged groups, particularly when politicians face reelection incentives. These predictions were tested using constituency-level conflict data and village-level survey results.
Empirical findings show that conflict tends to decrease in constituencies where reservations do not rotate, supporting the idea that consistent minority representation can reduce political tensions. In contrast, conflict remains unchanged or rises slightly at the village level, where reservations rotate and reelection is not possible. These results suggest that the design of reservation policies, especially term limits and incentive structures, plays a critical role in shaping their impact on political stability.
BERT + Regex classifier for political event types from the SATP dataset.
Lemmatizes and tokenizes ACLED conflict notes with NLTK for downstream NLP tasks.
Uses BERT sentence embeddings and regex rules to classify SATP conflict descriptions into 19 political event categories.
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import re

class EventCategorizer:
    def __init__(self):
        # Pretrained sentence-BERT model used for template similarity
        self.model = SentenceTransformer("bert-base-nli-mean-tokens")
        self.categories = {
            1: "Police or security forces arrest rebels/terrorists",
            2: "Police or security forces attack rebels/terrorists",
            3: "Encounter between police/security forces and rebels/terrorists",
            4: "Rebel/terrorist surrenders",
            5: "Police or security forces raid and secure weapons/items",
            6: "Rebels/terrorists attack civilians",
            7: "Rebels/terrorists attack police/security",
            8: "Rebels/terrorists attack infrastructure",
            9: "Politician statement on conflict",
            10: "Civilians protest peacefully",
            11: "Civilians riot or act violently",
            12: "Politicians reach out to rebels",
            13: "Former rebel runs for office",
            14: "Current rebel runs for office",
            15: "Rebels/terrorists threaten politician",
            16: "Police/security attack civilians",
            17: "Government peace/development policy",
            18: "Indian vs foreign forces clash",
            19: "Rebel group infighting"
        }
        # Example sentences embedded and compared against each description
        self.category_templates = {
            1: ["Police arrested a rebel", "Security forces apprehended terrorists"],
            2: ["Police attacked rebel hideout", "Security forces engaged with terrorists"],
            3: ["Police clashed with rebels", "Security forces and militants exchanged fire"],
            4: ["A rebel surrendered to authorities"],
            5: ["Police raided a hideout and seized weapons"],
            6: ["Terrorists opened fire on civilians"],
            7: ["Rebels ambushed a police patrol"],
            8: ["Rebels bombed a bridge"],
            9: ["The Prime Minister commented on the conflict"],
            10: ["Civilians held a peaceful protest"],
            11: ["Civilians rioted in the streets"],
            12: ["The government reached out to rebels"],
            13: ["A former rebel contested elections"],
            14: ["A rebel leader announced candidacy"],
            15: ["A politician received threats from rebels"],
            16: ["Security forces fired on protesters"],
            17: ["The government announced a development program"],
            18: ["Indian and Pakistani troops exchanged fire"],
            19: ["Two rebel groups fought each other"]
        }
        # Regex rules; an explicit match overrides the embedding-based guess
        self.event_patterns = {
            r"police\s+arrest.*(rebel|terrorist|maoist|cadre)": 1,
            r"security\s+forces?\s+attack.*(rebel|terrorist|militant)": 2,
            r"encounter.*(security|forces|rebel|terrorist|militant)": 3,
            r"(rebel|terrorist|maoist)\s+surrender": 4,
            r"raid.*(weapons|arms|explosives)": 5,
            r"(militant|terrorist|rebel).*attack.*(civilian|village)": 6,
            r"(militant|terrorist|rebel).*attack.*(police|security)": 7,
            r"(militant|terrorist|rebel).*attack.*(bridge|infrastructure)": 8,
            r"(politician|minister).*(statement|comment).*conflict": 9,
            r"peaceful\s+protest": 10,
            r"(riot|clash|violent)": 11,
            r"(government|politician).*(reintegrate|amnesty|reach out)": 12,
            r"former\s+(rebel|terrorist).*(election|office)": 13,
            r"current\s+(rebel|terrorist).*(election|office)": 14,
            r"(rebel|terrorist).*(threaten|warn).*politician": 15,
            r"(police|security).*attack.*civilian": 16,
            r"(policy|program).*(peace|development|amnesty)": 17,
            r"clash.*(indian.*foreign|troops|forces)": 18,
            r"(rebel|terrorist)\s+group.*(fight|clash|rival)": 19
        }
    def calculate_similarity(self, sentence1, sentence2):
        """Cosine similarity between the BERT embeddings of two sentences."""
        embeddings = self.model.encode([sentence1, sentence2])
        score = util.pytorch_cos_sim(embeddings[0], embeddings[1])
        return score.item()

    def categorize_event(self, description):
        # First pass: pick the category whose template is most similar
        best_score = 0
        best_category = None
        for category, templates in self.category_templates.items():
            scores = [self.calculate_similarity(description, t) for t in templates]
            max_score = max(scores)
            if max_score > best_score:
                best_score = max_score
                best_category = category
        # Second pass: a regex rule match takes precedence over similarity
        for pattern, cat in self.event_patterns.items():
            if re.search(pattern, description, re.IGNORECASE):
                best_category = cat
                break
        return best_category or "Unknown"
categorizer = EventCategorizer()
df = pd.read_csv(r'C:\Users\Jade Chen\Downloads\dataset_satp_clean_for_similarity.csv')

# Classify a 30-row sample and report the share of each category
subset = df.head(30).copy()
subset["category"] = subset["descr_short"].apply(categorizer.categorize_event)
print(subset[["descr_short", "category"]])

counts = subset["category"].value_counts()
ratios = {cat: count / len(subset) for cat, count in counts.items()}
for cat, ratio in ratios.items():
    print(f"Category {cat}: {ratio:.2f}")
Download on GitHub →
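A note on efficiency: categorize_event re-embeds every template for every description, since calculate_similarity encodes both sentences on each call. A minimal sketch of a batched variant, assuming the same model and the categorizer instance defined above (categorize_batched and template_emb are illustrative names, not part of the original script):

template_texts, template_cats = [], []
for cat, templates in categorizer.category_templates.items():
    template_texts.extend(templates)
    template_cats.extend([cat] * len(templates))

# Encode all templates once and reuse the cached embeddings
template_emb = categorizer.model.encode(template_texts, convert_to_tensor=True)

def categorize_batched(description):
    desc_emb = categorizer.model.encode(description, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(desc_emb, template_emb)[0]
    return template_cats[int(scores.argmax())]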
Preprocesses ACLED notes using lowercase normalization, tokenization, lemmatization, and stopword/punctuation removal.
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag

# Fetch the NLTK resources the pipeline depends on
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
file_path = r'acled_india_csv.csv'
acled_df = pd.read_csv(file_path, delimiter=';')
print(acled_df.dtypes)
def penn_to_wordnet(tag):
    """Map a Penn Treebank POS tag to the tag WordNetLemmatizer expects."""
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

def nlp_prep(data):
    data = data.copy()
    data['notes'] = data['notes'].fillna('').str.lower()
    data = data.drop_duplicates().reset_index(drop=True)
    lem = WordNetLemmatizer()
    stop = set(stopwords.words('english'))
    cleaned_notes = []
    for note in data['notes']:
        # Replace every non-alphanumeric character with a space
        note = re.sub(r'[^0-9A-Za-z]', ' ', note)
        words = word_tokenize(note)
        # POS-tag so the lemmatizer reduces verbs as well as nouns
        tagged_words = pos_tag(words)
        cleaned_words = [lem.lemmatize(word, penn_to_wordnet(pos))
                         for word, pos in tagged_words if word not in stop]
        cleaned_notes.append(' '.join(cleaned_words))
    data['cleaned_notes'] = cleaned_notes
    return data
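# Example usage (assumed): run the pipeline on the ACLED frame loaded above
acled_clean = nlp_prep(acled_df)
print(acled_clean[['notes', 'cleaned_notes']].head())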
Download on GitHub →