Data Programming
Assisted-labeling feature to help you generate labels using rules
The Data Programming extension allows us to generate labels by aggregating many heuristics or rules (dubbed labeling functions) that may not be accurate by themselves but are a better predictor than random when taken as a group.
Labeling functions allow you to define weak heuristics and rules that predict a label given unlabeled data. These heuristics can be derived from expert knowledge or other labeling models. Labeling functions should be written in Python.
- Multiple keyword searches (using regular expressions) within the text. For example, in finding a severity score, we searched for the phrase in numeric format and in Roman numeral format.
- Complex pre-processor model from NLTK, spaCy, or textblob
- POS tagging, sentiment analysis, general NER, dependency parsing, syntax tree construction, stop-word lists, similarity, etc.
- 1.pydantic: ^1.9.2
- 2.snorkel: 0.9.9
- 3.pandas: ^1.4.4
- 4.textblob: ^0.17.1
- 5.nltk: ^3.7
- 6.spaCy: ^3.4.1
- 7.numpy: 1.22.4
- 8.scipy: 1.9.1
import re
from snorkel.labeling import labeling_function
​
# Value (the integer -1) that a @labeling_function returns to abstain from voting
ABSTAIN = -1
# Constant mapping of label name -> integer label id, SHOULD NOT BE EDITED
LABELS = {
    'business': 0,
    'science': 1,
    'sports': 2,
    'world': 3,
}
​
# This function should follow this template and correctly implement the labeling_function interface.
# Variable x contains the content of the columns, the column names, and a helper dictionary. Its attributes are listed below.
"""
x.line: int
x.columns: List[str]
x.column_names: List[str]
x.column_name_to_index: dict # key -> column_name
"""
​
# Keywords that vote for each label. Every entry is used as a regular-expression
# pattern and searched case-insensitively in the sample text, so an entry
# matches anywhere inside a word (e.g. 'war' also matches "award").
KEYWORDS = {
    'business': ['workers'],
    'science': ['space', 'chemistry', 'researcher', 'education', 'school', 'virus'],
    'sports': ['medal', 'record', 'bicep', 'football', 'physic', 'game'],
    'world': [
        'confrontation', 'violent', 'harassed', 'fight', 'vehicle',
        'government', 'employment', 'military', 'war',
    ],
}
​
@labeling_function()
def labeling_function(x):
    """Vote for the first label in KEYWORDS that has a keyword in the sample.

    All column values of ``x`` are concatenated without a separator, and each
    keyword is searched in that text as a case-insensitive regular expression.
    Labels are tried in KEYWORDS order.

    Returns:
        The LABELS id of the first label with a matching keyword, or ABSTAIN
        when no keyword matches.
    """
    sample_text = ''.join(x.columns)
    for label_name, patterns in KEYWORDS.items():
        if any(re.search(pattern, sample_text, re.IGNORECASE) for pattern in patterns):
            return LABELS[label_name]
    return ABSTAIN
from snorkel.preprocess.nlp import SpacyPreprocessor
​
# SpacyPreprocessor parses the text found in `text_field` and stores the
# enriched representation in `doc_field` (read below as x.doc).
spacy = SpacyPreprocessor(
    text_field="text",
    doc_field="doc",
    memoize=True,
)
@labeling_function(pre=[spacy])
def has_person(x):
    """Vote HAM for short texts that mention a person.

    Relies on the ``spacy`` preprocessor to populate ``x.doc``. Votes HAM when
    ``len(x.doc)`` is below 20 and at least one entity in ``x.doc.ents`` is
    labeled "PERSON"; abstains otherwise.
    """
    # NOTE(review): HAM is not defined in this snippet -- it has to be defined
    # alongside ABSTAIN before this function can run; confirm the intended value.
    is_short = len(x.doc) < 20
    if is_short and any(ent.label_ == "PERSON" for ent in x.doc.ents):
        return HAM
    return ABSTAIN
import re
from snorkel.labeling import labeling_function
​
# Value (the integer -1) that a @labeling_function returns to abstain from voting
ABSTAIN = -1
# Constant mapping of label name -> integer label id, SHOULD NOT BE EDITED
LABELS = {
    'negative': 0,
    'positive': 1,
}
​
# This function should follow this template and correctly implement the labeling_function interface.
# Variable x contains the content of the columns, the column names, and a helper dictionary. Its attributes are listed below.
"""
x.line: int
x.columns: List[str]
x.column_names: List[str]
x.column_name_to_index: dict # key -> column_name
"""
​
from snorkel.preprocess import preprocessor
from textblob import TextBlob
​
@preprocessor(memoize=True)
def textblob_sentiment(x):
    """Attach TextBlob's sentiment polarity of the sample text to ``x``.

    The column values are concatenated without a separator, scored with
    TextBlob, and the resulting polarity is stored as ``x.polarity``.

    Returns:
        The same ``x``, now carrying the ``polarity`` attribute.
    """
    blob = TextBlob(''.join(x.columns))
    x.polarity = blob.sentiment.polarity
    return x
​
@labeling_function(pre=[textblob_sentiment])
def labeling_function(x):
    """Label the sample by the polarity computed by ``textblob_sentiment``.

    Returns:
        LABELS['positive'] when ``x.polarity`` is above 0.16, otherwise
        LABELS['negative']. This function never abstains.
    """
    label_name = 'positive' if x.polarity > 0.16 else 'negative'
    return LABELS[label_name]
import re
from stegosaurus.annotator import target_label
​
# Constant mapping of label name -> integer label id, SHOULD NOT BE EDITED
LABELS = {
    'business': 0,
    'science': 1,
    'sports': 2,
    'world': 3,
}
​
# Labeling function definition (Start editing here!)
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['world'])
def label_function(sample):
    """Return True when the sample text contains any of the 'world' keywords.

    Args:
        sample: mapping whose first value is used as the text to inspect
            (assumes that value is a string -- confirm against the caller).

    Returns:
        True if at least one keyword is found (case-insensitive, matched
        anywhere in the text, including inside longer words), else False.
    """
    text = list(sample.values())[0]

    # Implement your logic here
    TARGET_KEYWORDS = [
        'confrontation', 'violent', 'harassed', 'fight', 'vehicle',
        'government', 'employment', 'military', 'war',
    ]
    # Backslashes are stripped from each keyword before it is used as a pattern.
    return any(
        re.search(keyword.replace("\\", ''), text, re.IGNORECASE)
        for keyword in TARGET_KEYWORDS
    )
import re
from stegosaurus.annotator import target_label
​
# Constant mapping of label name -> integer label id, SHOULD NOT BE EDITED
LABELS = {
    'business': 0,
    'science': 1,
    'sports': 2,
    'world': 3,
}
​
# Labeling function definition (Start editing here!)
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['sports'])
def label_function(sample):
    """Return True when the sample text contains a score such as "3-1".

    A score is two non-negative integers without leading zeros joined by a
    hyphen and delimited by word boundaries.

    Args:
        sample: mapping whose first value is used as the text to inspect.

    Returns:
        True if any pattern matches somewhere in the text, else False.
    """
    text = list(sample.values())[0]

    # Implement your logic here
    score_pattern = re.compile(r"\b(0|[1-9]\d*)-(0|[1-9]\d*)\b")
    patterns = (score_pattern,)
    return any(compiled.search(text) for compiled in patterns)
import re
from stegosaurus.annotator import target_label
​
# Constant mapping of label name -> integer label id, SHOULD NOT BE EDITED
LABELS = {
    'business': 0,
    'science': 1,
    'sports': 2,
    'world': 3,
}
​
# Labeling function definition (Start editing here!)
import spacy
# Load the spaCy pipeline once at import time; the labeling function below reuses it on every call.
nlp = spacy.load("en_core_web_sm")
# Entity labels that count as evidence for the target label.
# NOTE(review): NORP is, per spaCy's label scheme, nationalities or religious/political groups -- confirm.
NER_LABELS = ["NORP"]
​
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['world'])
def label_function(sample):
    """Return True when the text contains an entity whose label is in NER_LABELS.

    The text is parsed with the module-level ``nlp`` pipeline; the surface
    text of every entity with a label in NER_LABELS becomes a keyword, and
    the function reports whether any keyword occurs in the text.

    Args:
        sample: mapping whose first value is used as the text to inspect.

    Returns:
        True if at least one such entity keyword is found, else False.
    """
    text = list(sample.values())[0]

    # Implement your logic here
    doc = nlp(text)
    target_keywords = [str(ent) for ent in doc.ents if ent.label_ in NER_LABELS]

    # Entity text is a literal string, not a pattern: escape it so regex
    # metacharacters (e.g. "+", "(", ".") cannot raise re.error or change
    # what is matched.
    return any(
        re.search(re.escape(keyword), text, re.IGNORECASE)
        for keyword in target_keywords
    )
import re
from stegosaurus.annotator import target_label
​
# Constant mapping of label name -> integer label id, SHOULD NOT BE EDITED
LABELS = {
    'positive': 0,
    'negative': 1,
}
​
# Labeling function definition (Start editing here!)
from textblob import TextBlob
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['positive'])
def label_function(sample):
    """Return True when TextBlob scores the sample text as positive.

    Args:
        sample: mapping whose first value is used as the text to inspect.

    Returns:
        True if the TextBlob sentiment polarity is strictly greater than 0,
        else False.
    """
    text = list(sample.values())[0]

    # Implement your logic here
    blob = TextBlob(text)
    if blob.sentiment.polarity > 0:
        return True
    return False
import re
from stegosaurus.annotator import target_label
​
# Constant mapping of label name -> integer label id, SHOULD NOT BE EDITED
LABELS = {
    'business': 0,
    'science': 1,
    'sports': 2,
    'world': 3,
}
​
# Labeling function definition (Start editing here!)
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['world'])
def label_function(sample):
    """Return the matches of every 'world' keyword in the sample text.

    Args:
        sample: mapping with a 'text' entry holding the text to inspect.

    Returns:
        A list with one ``re.finditer`` iterator per keyword, in keyword
        order; each iterator yields the match objects for that keyword
        (case-insensitive, matched anywhere in the text).
    """
    text = sample['text']

    # Implement your logic here
    TARGET_KEYWORDS = [
        'confrontation', 'violent', 'harassed', 'fight', 'vehicle',
        'government', 'employment', 'military', 'war',
    ]

    match_list = [
        re.finditer(target, text, re.IGNORECASE) for target in TARGET_KEYWORDS
    ]

    return match_list
import re
from stegosaurus.annotator import target_label
​
# Constant mapping of label name -> integer label id, SHOULD NOT BE EDITED
LABELS = {
    'business': 0,
    'science': 1,
    'sports': 2,
    'world': 3,
}
​
# Labeling function definition (Start editing here!)
# Assign target label based on LABELS dictionary
@target_label(label=LABELS['sports'])
def label_function(sample):
text = sample['text']
​
# Implement your logic here
score = re.compile(r"\b(0|[1-9]\d*)-(0|[1-9]\d*)\b")
PATTERNS = [score]
​
match_list = [re.finditer(pattern, text) for pattern in PATTERNS]