- Loading Required Libraries
- Loading Metadata Information
- Fetching all the JSON Files
- Helper Functions
- Data Pre-Processing/Cleaning
- Sentence Tokenization
- Loading Flair & Elmo Contextual Biomedical Embeddings
- Dimensionality Reduction with t-SNE
- Create Clusters (K-Means) of Sentence Embeddings
- Semantic Search
- An End-To-End Closed Domain Question Answering System (CdQA)
Loading Required Libraries
## General Utilities
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import json
import re
import os
import warnings
import matplotlib.pyplot as plt'ggplot')
## Sklearn Utilities
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
## Tqdm Utilities
from tqdm import tqdm_notebook, tnrange
from import tqdm
## Bokeh Utilities
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, CustomJS
from bokeh.palettes import Category20
from bokeh.transform import linear_cmap
from import output_file, show
from bokeh.transform import transform
from import output_notebook
from bokeh.plotting import figure
from bokeh.layouts import column
from bokeh.models import RadioButtonGroup
from bokeh.models import TextInput
from bokeh.layouts import gridplot
from bokeh.models import Div
from bokeh.models import Paragraph
from bokeh.layouts import column, widgetbox
## IPython Utilities
from IPython.display import HTML
import notebook as widgets
from ipywidgets import interact, interactive, fixed, interact_manual, interactive_output, VBox
from IPython.html import widgets
from IPython.display import display, Image, HTML, Markdown, clear_output
## Install flair library
!pip install flair
## Install allennlp library
!pip install allennlp
!python -m spacy download en_core_web_md
## Load Spacy Utilities:
import spacy
import en_core_web_md
nlp = en_core_web_md.load()
## Flair Utilities
from flair.embeddings import ELMoEmbeddings, PooledFlairEmbeddings, Sentence, DocumentPoolEmbeddings
from typing import List
Loading Metadata Information
root_path = '/kaggle/input/CORD-19-research-challenge/'
metadata_path = f'{root_path}/metadata.csv'
meta_df = pd.read_csv(metadata_path, dtype={
'pubmed_id': str,
'Microsoft Academic Paper ID': str,
'doi': str
sha | source_x | title | doi | pmcid | pubmed_id | license | abstract | publish_time | authors | journal | Microsoft Academic Paper ID | WHO #Covidence | has_full_text | full_text_file | |
0 | NaN | Elsevier | Intrauterine virus infections and congenital h... | 10.1016/0002-8703(72)90077-4 | NaN | 4361535 | els-covid | Abstract The etiologic basis for the vast majo... | 1972-12-31 | Overall, James C. | American Heart Journal | NaN | NaN | False | custom_license |
1 | NaN | Elsevier | Coronaviruses in Balkan nephritis | 10.1016/0002-8703(80)90355-5 | NaN | 6243850 | els-covid | NaN | 1980-03-31 | Georgescu, Leonida; Diosi, Peter; Buţiu, Ioan;... | American Heart Journal | NaN | NaN | False | custom_license |
2 | NaN | Elsevier | Cigarette smoking and coronary heart disease: ... | 10.1016/0002-8703(80)90356-7 | NaN | 7355701 | els-covid | NaN | 1980-03-31 | Friedman, Gary D | American Heart Journal | NaN | NaN | False | custom_license |
3 | aecbc613ebdab36753235197ffb4f35734b5ca63 | Elsevier | Clinical and immunologic studies in identical ... | 10.1016/0002-9343(73)90176-9 | NaN | 4579077 | els-covid | Abstract Middle-aged female identical twins, o... | 1973-08-31 | Brunner, Carolyn M.; Horwitz, David A.; Shann,... | The American Journal of Medicine | NaN | NaN | True | custom_license |
4 | NaN | Elsevier | Epidemiology of community-acquired respiratory... | 10.1016/0002-9343(85)90361-4 | NaN | 4014285 | els-covid | Abstract Upper respiratory tract infections ar... | 1985-06-28 | Garibaldi, Richard A. | The American Journal of Medicine | NaN | NaN | False | custom_license |
## Information about Metadata:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44220 entries, 0 to 44219
Data columns (total 15 columns):
sha 28462 non-null object
source_x 44220 non-null object
title 43996 non-null object
doi 40750 non-null object
pmcid 23319 non-null object
pubmed_id 22943 non-null object
license 44220 non-null object
abstract 35806 non-null object
publish_time 34197 non-null object
authors 41074 non-null object
journal 33173 non-null object
Microsoft Academic Paper ID 964 non-null object
WHO #Covidence 1767 non-null object
has_full_text 44220 non-null bool
full_text_file 32829 non-null object
dtypes: bool(1), object(14)
memory usage: 4.8+ MB
Fetching all the JSON files
all_json = glob.glob(f'{root_path}/**/*.json', recursive=True)
Helper Functions
def cstr(s, color='blue'):
    """Wrap ``s`` in a ``<text>`` tag whose inline style sets its colour.

    Args:
        s: Value to render; it is interpolated with default string formatting.
        color: Colour value written into the ``style`` attribute.

    Returns:
        The string ``<text style=color:COLOR>S</text>``.
    """
    return f"<text style=color:{color}>{s}</text>"
def printmd(string):
## JSON File Reader Class
class FileReader:
    """Read one paper JSON file and expose its id, abstract and body text.

    After construction the instance carries:
        paper_id:  value of the top-level ``paper_id`` key.
        abstract:  the abstract paragraphs joined with newlines (a ``str``).
        body_text: the body paragraphs joined with newlines (a ``str``).
    """

    def __init__(self, file_path):
        with open(file_path) as file:
            content = json.load(file)
        self.paper_id = content['paper_id']
        # NOTE(review): the bodies of the two collection loops were missing
        # from the listing. Each entry is assumed to be a dict holding its
        # paragraph under the 'text' key (CORD-19 JSON schema) -- confirm
        # against the data before relying on this.
        # Abstract
        self.abstract = '\n'.join(entry['text'] for entry in content['abstract'])
        # Body text
        self.body_text = '\n'.join(entry['text'] for entry in content['body_text'])

    def __repr__(self):
        # Preview only: first 200 characters of the abstract and of the body.
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
first_row = FileReader(all_json[0])
cd92f91038067e7a10aa27d676ce696e1e4d67ce: Mesenchymal stem cells have been widely studied to promote local bone regeneration of osteonecrosis of the femoral head (ONFH). Previous studies observed that dimethyloxaloylglycine (DMOG) enhanced th... Osteonecrosis of the femoral head (ONFH) is a pathological process primarily caused by interrupted local blood circulation, which can cause apoptosis of osteocytes and osseous tissue necrosis (1, 2) ....
def get_breaks(content, length):
    """Insert ``<br>`` tags into ``content`` so it wraps in an HTML tooltip.

    The text is split on single spaces. A running total of word lengths
    (spaces are not counted) is kept; when it exceeds ``length`` the current
    word is prefixed with ``<br>`` instead of a space and the total restarts
    at zero.

    Args:
        content: Text to wrap.
        length: Character budget per line before a break is inserted.

    Returns:
        The wrapped text. Every word that does not start a new line is
        preceded by a single space, so the result starts with a space unless
        the very first word already exceeds ``length``.
    """
    pieces = []
    run_chars = 0
    for word in content.split(' '):
        run_chars += len(word)
        if run_chars > length:
            pieces.append("<br>" + word)
            run_chars = 0
        else:
            # Without this branch the word that triggers a break would be
            # emitted twice (once after "<br>" and once after " ").
            pieces.append(" " + word)
    return "".join(pieces)
Convert the Data into Pandas DataFrame
# Column-wise accumulator; it is later passed to pd.DataFrame to build df_covid.
dict_ = {'paper_id': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [], 'abstract_summary': []}
# NOTE(review): this listing has lost its indentation, and several lines
# (e.g. the `try:` / `else:` / `continue` statements implied by the comments
# and `except` clauses below) appear to be missing -- restore before running.
for idx, entry in enumerate(all_json):
# progress message about every 10% of the files
# NOTE(review): len(all_json) // 10 is 0 for fewer than 10 files, which makes the modulo raise ZeroDivisionError
if idx % (len(all_json) // 10) == 0:
print(f'Processing index: {idx} of {len(all_json)}')
content = FileReader(entry)
# get metadata information
meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
# no metadata, skip this paper
if len(meta_data) == 0:
# also create a column for the summary of abstract to be used in a plot
if len(content.abstract) == 0:
# no abstract provided
dict_['abstract_summary'].append("Not provided.")
elif len(content.abstract.split(' ')) > 100:
# abstract provided is too long for plot, take first 100 words and append "..."
info = content.abstract.split(' ')[:100]
summary = get_breaks(' '.join(info), 40)
dict_['abstract_summary'].append(summary + "...")
# abstract is short enough
summary = get_breaks(content.abstract, 40)
# get metadata information (NOTE(review): repeats the identical lookup made above)
meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
# if more than one author
authors = meta_data['authors'].values[0].split(';')
if len(authors) > 2:
# more than 2 authors, may be problem when plotting, so take first 2 append with ...
dict_['authors'].append(". ".join(authors[:2]) + "...")
# authors will fit in plot
dict_['authors'].append(". ".join(authors))
except Exception as e:
# if only one author - or Null value
# add the title information, add breaks when needed
title = get_breaks(meta_data['title'].values[0], 40)
# if title was not provided
except Exception as e:
# add the journal information
Processing index: 0 of 29315
Processing index: 2931 of 29315
Processing index: 5862 of 29315
Processing index: 8793 of 29315
Processing index: 11724 of 29315
Processing index: 14655 of 29315
Processing index: 17586 of 29315
Processing index: 20517 of 29315
Processing index: 23448 of 29315
Processing index: 26379 of 29315
Processing index: 29310 of 29315
df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text', 'authors', 'title', 'journal', 'abstract_summary'])
paper_id | abstract | body_text | authors | title | journal | abstract_summary | |
0 | cd92f91038067e7a10aa27d676ce696e1e4d67ce | Mesenchymal stem cells have been widely studie... | Osteonecrosis of the femoral head (ONFH) is a ... | Zhu, Zhen-Hong. Song, Wen-Qi... | Dimethyloxaloylglycine increases bone<br>repa... | Exp Ther Med | Mesenchymal stem cells have been widely<br>st... |
1 | bab279da548d8bd363acd5033e9dc54e7dbb7107 | Schoolchildren play a major role in the spread... | Chu, Yanhui. Wu, Zhenyu... | Effects of school breaks on influenza-like<br... | BMJ Open | Not provided. | |
2 | 71edbd57cdd9af956a12054932e0cbdb87ce1fea | Research has shown that obesity appears to spr... | The study of the effects of social networks on... | Lee, Won Joon. Youm, Yoosik... | Social Network Characteristics and Body Mass<... | J Prev Med Public Health | Research has shown that obesity appears to<br... |
3 | 2dfdbf2d6b77426866feaf93486327d372fd27c7 | There may be many reasons for the significant ... | Cha, Sung-Ho | The history of vaccination and current<br>vac... | Clin Exp Vaccine Res | Not provided. | |
4 | 0afa3ea846396533c7ca515968abcfea3f895082 | There is an emerging paradigm that the human m... | port neutrophil infiltration in inflammatory-d... | Burgess, Stacey L.. Buonomo, Erica... | Bone Marrow Dendritic Cells from Mice with an... | mBio | There is an emerging paradigm that the human<... |
dict_ = None
Data Pre-Processing/Cleaning
## Adding word count columns for both abstract and body_text
df_covid['abstract_word_count'] = df_covid['abstract'].apply(lambda x: len(x.strip().split()))
df_covid['body_word_count'] = df_covid['body_text'].apply(lambda x: len(x.strip().split()))
paper_id | abstract | body_text | authors | title | journal | abstract_summary | abstract_word_count | body_word_count | |
0 | cd92f91038067e7a10aa27d676ce696e1e4d67ce | Mesenchymal stem cells have been widely studie... | Osteonecrosis of the femoral head (ONFH) is a ... | Zhu, Zhen-Hong. Song, Wen-Qi... | Dimethyloxaloylglycine increases bone<br>repa... | Exp Ther Med | Mesenchymal stem cells have been widely<br>st... | 246 | 3711 |
1 | bab279da548d8bd363acd5033e9dc54e7dbb7107 | Schoolchildren play a major role in the spread... | Chu, Yanhui. Wu, Zhenyu... | Effects of school breaks on influenza-like<br... | BMJ Open | Not provided. | 0 | 2838 | |
2 | 71edbd57cdd9af956a12054932e0cbdb87ce1fea | Research has shown that obesity appears to spr... | The study of the effects of social networks on... | Lee, Won Joon. Youm, Yoosik... | Social Network Characteristics and Body Mass<... | J Prev Med Public Health | Research has shown that obesity appears to<br... | 243 | 2436 |
3 | 2dfdbf2d6b77426866feaf93486327d372fd27c7 | There may be many reasons for the significant ... | Cha, Sung-Ho | The history of vaccination and current<br>vac... | Clin Exp Vaccine Res | Not provided. | 0 | 3375 | |
4 | 0afa3ea846396533c7ca515968abcfea3f895082 | There is an emerging paradigm that the human m... | port neutrophil infiltration in inflammatory-d... | Burgess, Stacey L.. Buonomo, Erica... | Bone Marrow Dendritic Cells from Mice with an... | mBio | There is an emerging paradigm that the human<... | 331 | 2937 |
## Remove Duplicates
df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)
## Remove NA's from data
<class 'pandas.core.frame.DataFrame'>
Int64Index: 26043 entries, 0 to 26792
Data columns (total 9 columns):
paper_id 26043 non-null object
abstract 26043 non-null object
body_text 26043 non-null object
authors 26043 non-null object
title 26043 non-null object
journal 26043 non-null object
abstract_summary 26043 non-null object
abstract_word_count 26043 non-null int64
body_word_count 26043 non-null int64
dtypes: int64(2), object(7)
memory usage: 2.0+ MB
## Taking only 12000 articles for analysis:
df_covid = df_covid.head(12000)
## Remove punctuation from each text:
# Keep only ASCII letters, digits and whitespace. The upper-case range must be
# `A-Z`: the range `A-z` also spans the characters [ \ ] ^ _ and the backtick,
# so those would not be stripped. Raw strings keep `\s` from being treated as
# a string escape.
df_covid['body_text'] = df_covid['body_text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
df_covid['abstract'] = df_covid['abstract'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
df_covid['title'] = df_covid['title'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
## Convert each text to lower case:
def lower_case(input_str):
    """Return ``input_str`` converted to lower case with ``str.lower``."""
    return input_str.lower()
df_covid['body_text'] = df_covid['body_text'].apply(lambda x: lower_case(x))
df_covid['abstract'] = df_covid['abstract'].apply(lambda x: lower_case(x))
df_covid['title'] = df_covid['title'].apply(lambda x: lower_case(x))
## Considering titles of articles only:
text = df_covid[["title"]]
title | |
0 | dimethyloxaloylglycine increases bonebrrepair... |
1 | effects of school breaks on influenzalikebril... |
2 | social network characteristics and body massb... |
3 | the history of vaccination and currentbrvacci... |
4 | bone marrow dendritic cells from mice with an... |
## Converting text dataframe into array:
text_arr = text.stack().tolist()
## Considering only 500 articles for analysis:
require_text = text_arr[:500]
Sentence Tokenization
## Using Spacy module for Sentence Tokenization:
sentences = []
for body in tqdm(require_text):
doc = nlp(body)
for i in doc.sents:
if len(i)>10:
## Taking those sentences only which have length more than 10
HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))
Loading Flair & Elmo Contextual Biomedical Embeddings
## Creating Document Pool Embeddings using a stack of PooledFlairEmbeddings('pubmed-forward'), PooledFlairEmbeddings('pubmed-backward') & ELMoEmbeddings('pubmed')
document_embeddings = DocumentPoolEmbeddings([PooledFlairEmbeddings('pubmed-forward'),
2020-03-24 21:33:00,391 not found in cache, downloading to /tmp/tmpln4dfp9r
100%|██████████| 111081366/111081366 [00:13<00:00, 8476615.06B/s]
2020-03-24 21:33:14,725 copying /tmp/tmpln4dfp9r to cache at /root/.flair/embeddings/
2020-03-24 21:33:14,955 removing temp file /tmp/tmpln4dfp9r
2020-03-24 21:33:17,689 not found in cache, downloading to /tmp/tmp227ehnss
100%|██████████| 111081366/111081366 [00:27<00:00, 4066268.52B/s]
2020-03-24 21:33:46,202 copying /tmp/tmp227ehnss to cache at /root/.flair/embeddings/
2020-03-24 21:33:46,428 removing temp file /tmp/tmp227ehnss
100%|██████████| 336/336 [00:00<00:00, 474187.80B/s]
100%|██████████| 374434792/374434792 [00:24<00:00, 15539785.94B/s]
## Getting sentence embeddings for each sentence and storing those into flair_elmo_ls:
flair_elmo_ls = []
for _sent in tqdm(sentences):
example = Sentence(_sent)
HBox(children=(FloatProgress(value=0.0, max=331.0), HTML(value='')))
## Converting embeddings into numpy array :
flair_elmo_arr = [emb.cpu().detach().numpy() for emb in flair_elmo_ls]
Dimensionality Reduction with t-SNE
tsne = TSNE(verbose=1, perplexity=5)
X_embedded = tsne.fit_transform(flair_elmo_arr)
[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 331 samples in 0.163s...
[t-SNE] Computed neighbors for 331 samples in 1.927s...
[t-SNE] Computed conditional probabilities for sample 331 / 331
[t-SNE] Mean sigma: 5.320273
[t-SNE] KL divergence after 250 iterations with early exaggeration: 88.668213
[t-SNE] KL divergence after 1000 iterations: 1.362305
Create Clusters (K-Means) of Sentence Embeddings
from sklearn.cluster import MiniBatchKMeans
k = 20
kmeans = MiniBatchKMeans(n_clusters=k)
y_pred = kmeans.fit_predict(flair_elmo_arr)
y = y_pred
from matplotlib import pyplot as plt
import seaborn as sns
import random
# sns settings
# One distinct hue per cluster label actually present in y, so the palette
# length always matches the number of hue levels (20 when every cluster is used).
palette = sns.hls_palette(len(np.unique(y)), l=.4, s=.9)
# plot
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.scatterplot(x=X_embedded[:, 0], y=X_embedded[:, 1], hue=y, legend='full', palette=palette)
plt.title("t-SNE Covid-19 Articles - Clustered (K-Means) - Flair & Elmo Biomedical Embeddings")
y_labels = y_pred
# data sources
source = ColumnDataSource(data=dict(
x= X_embedded[:,0],
y= X_embedded[:,1],
x_backup = X_embedded[:,0],
y_backup = X_embedded[:,1],
desc= y_labels,
titles= df_covid['title'],
authors = df_covid['authors'],
journal = df_covid['journal'],
abstract = df_covid['abstract_summary'],
labels = ["C-" + str(x) for x in y_labels]
# hover over information
hover = HoverTool(tooltips=[
("Title", "@titles{safe}"),
("Author(s)", "@authors"),
("Journal", "@journal"),
("Abstract", "@abstract{safe}"),
# map colors
mapper = linear_cmap(field_name='desc',
low=min(y_labels) ,high=max(y_labels))
# prepare the figure
p = figure(plot_width=800, plot_height=800,
tools=[hover, 'pan', 'wheel_zoom', 'box_zoom', 'reset'],
title="t-SNE Covid-19 Articles, Clustered(K-Means), Flair & Elmo Biomedical Embeddings",
# plot
p.scatter('x', 'y', size=5,
legend = 'labels')
# add callback to control
callback = CustomJS(args=dict(p=p, source=source), code="""
var radio_value =;
var data =;
x = data['x'];
y = data['y'];
x_backup = data['x_backup'];
y_backup = data['y_backup'];
labels = data['desc'];
if (radio_value == '20') {
for (i = 0; i < x.length; i++) {
x[i] = x_backup[i];
y[i] = y_backup[i];
else {
for (i = 0; i < x.length; i++) {
if(labels[i] == radio_value) {
x[i] = x_backup[i];
y[i] = y_backup[i];
} else {
x[i] = undefined;
y[i] = undefined;
# callback for searchbar
keyword_callback = CustomJS(args=dict(p=p, source=source), code="""
var text_value = cb_obj.value;
var data =;
x = data['x'];
y = data['y'];
x_backup = data['x_backup'];
y_backup = data['y_backup'];
abstract = data['abstract'];
titles = data['titles'];
authors = data['authors'];
journal = data['journal'];
for (i = 0; i < x.length; i++) {
if(abstract[i].includes(text_value) ||
titles[i].includes(text_value) ||
authors[i].includes(text_value) ||
journal[i].includes(text_value)) {
x[i] = x_backup[i];
y[i] = y_backup[i];
} else {
x[i] = undefined;
y[i] = undefined;
# option
option = RadioButtonGroup(labels=["C-0", "C-1", "C-2",
"C-3", "C-4", "C-5",
"C-6", "C-7", "C-8",
"C-9", "C-10", "C-11",
"C-12", "C-13", "C-14",
"C-15", "C-16", "C-17",
"C-18", "C-19", "All"],
active=20, callback=callback)
# search box
keyword = TextInput(title="Search:", callback=keyword_callback)
header = Div(text="""<h1>COVID-19 Articles Cluster</h1>""")
# show
show(column(header, widgetbox(option, keyword),p))
BokehUserWarning: ColumnDataSource's columns must be of the same length. Current lengths: ('abstract', 12000), ('authors', 12000), ('desc', 331), ('journal', 12000), ('labels', 331), ('titles', 12000), ('x', 331), ('x_backup', 331), ('y', 331), ('y_backup', 331)
BokehDeprecationWarning: 'legend' keyword is deprecated, use explicit 'legend_label', 'legend_field', or 'legend_group' keywords instead
Semantic Search
# Rank the stored sentences by cosine similarity to a query and build an HTML listing.
# search_string: query text; results_returned: how many top matches to include (cast to int).
# NOTE(review): nothing visible here embeds `example_text` before get_embedding() is
# called, and the `else:` / final display-or-return lines seem to be missing -- confirm
# against the original notebook.
def get_similarity(search_string, results_returned = 3):
example_text = Sentence(search_string)
search_vect = example_text.get_embedding()
search_vect = search_vect.cpu().detach().numpy()
# similarity of the query vector against every row of flair_elmo_arr
cosine_similarities = pd.Series(cosine_similarity([search_vect], flair_elmo_arr).flatten())
output =""
# NOTE(review): Series.iteritems() was removed in pandas 2.0; .items() is the replacement
for i,j in cosine_similarities.nlargest(int(results_returned)).iteritems():
output +='<p style="font-family:verdana; font-size:110%;"> '
# NOTE(review): the inner loop reuses the name `i`, shadowing the outer index
for i in sentences[i].split():
# substring test against the raw query; matching words are wrapped in <b>
if i.lower() in search_string:
output += " <b>"+str(i)+"</b>"
output += " "+str(i)
output += "</p><hr>"
output = '<h3>Results:</h3>'+output
text = widgets.Text(
value='virus genetics, origin, and evolution',
placeholder='Paste ticket description here!',
layout=widgets.Layout(width='50%', height='50px')
out = widgets.Output()
def callback(_):
with out:
# what happens when we press the button
printmd("**<font color=orange> -------------------------------------------------------------------------------------------------------- </font>**")
printmd(f"**<font color=blue>Semantic Search has Started </font>**")
printmd("**<font color=orange> -------------------------------------------------------------------------------------------------------- </font>**")
# displaying button and its output together
widgets.VBox([text, out])
VBox(children=(Text(value='virus genetics, origin, and evolution', description='Query:', layout=Layout(height=…
An End-To-End Closed Domain Question Answering System (CdQA)
# Install an End-To-End Closed Domain Question Answering System
!pip install cdqa
## Load Cdqa Utilities:
from ast import literal_eval
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from import download_model
## Download BERT Squad 1.1 Pretrained Q&A Model
download_model(model='bert-squad_1.1', dir='./models')
## Converting body_text into different paragraphs :
df_covid["paragraphs"] = [x.split('\n') for x in df_covid["body_text"]]
df = filter_paragraphs(df_covid)
paper_id | abstract | body_text | authors | title | journal | abstract_summary | abstract_word_count | body_word_count | paragraphs | |
0 | cd92f91038067e7a10aa27d676ce696e1e4d67ce | mesenchymal stem cells have been widely studie... | osteonecrosis of the femoral head onfh is a pa... | Zhu, Zhen-Hong. Song, Wen-Qi... | dimethyloxaloylglycine increases bonebrrepair... | Exp Ther Med | Mesenchymal stem cells have been widely<br>st... | 246 | 3711 | [osteonecrosis of the femoral head onfh is a p... |
1 | bab279da548d8bd363acd5033e9dc54e7dbb7107 | schoolchildren play a major role in the spread... | Chu, Yanhui. Wu, Zhenyu... | effects of school breaks on influenzalikebril... | BMJ Open | Not provided. | 0 | 2838 | [schoolchildren play a major role in the sprea... | |
2 | 71edbd57cdd9af956a12054932e0cbdb87ce1fea | research has shown that obesity appears to spr... | the study of the effects of social networks on... | Lee, Won Joon. Youm, Yoosik... | social network characteristics and body massb... | J Prev Med Public Health | Research has shown that obesity appears to<br... | 243 | 2436 | [the study of the effects of social networks o... |
3 | 2dfdbf2d6b77426866feaf93486327d372fd27c7 | there may be many reasons for the significant ... | Cha, Sung-Ho | the history of vaccination and currentbrvacci... | Clin Exp Vaccine Res | Not provided. | 0 | 3375 | [there may be many reasons for the significant... | |
4 | 0afa3ea846396533c7ca515968abcfea3f895082 | there is an emerging paradigm that the human m... | port neutrophil infiltration in inflammatorydi... | Burgess, Stacey L.. Buonomo, Erica... | bone marrow dendritic cells from mice with an... | mBio | There is an emerging paradigm that the human<... | 331 | 2937 | [port neutrophil infiltration in inflammatoryd... |
cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib')
QAPipeline(reader=BertQA(adam_epsilon=1e-08, bert_model='bert-base-uncased',
do_lower_case=True, fp16=False,
gradient_accumulation_steps=1, learning_rate=5e-05,
local_rank=-1, loss_scale=0, max_answer_length=30,
n_best_size=20, no_cuda=False,
null_score_diff_threshold=0.0, num_train_epochs=3.0,
output_dir=None, predict_batch_size=8, seed=42,
server_ip='', server_po...size=8,
verbose_logging=False, version_2_with_negative=False,
warmup_proportion=0.1, warmup_steps=0),
retriever=BM25Retriever(b=0.75, floor=None, k1=2.0, lowercase=True,
max_df=0.85, min_df=2, ngram_range=(1, 2),
preprocessor=None, stop_words='english',
tokenizer=None, top_n=20, verbose=False,
# Run the cdQA pipeline on question x and format the result as HTML fragments.
# The code indexes the prediction as [0] = answer, [1] = title, [2] = paragraph.
# NOTE(review): the lines that display or return these fragments are not visible here.
def get_cdqa_prediction(x):
prediction = cdqa_pipeline.predict(x)
question = '<h3>Question:</h3>'+x
answer = '<h3>Answer:</h3>'+prediction[0]
title = '<h3>Title:</h3>'+prediction[1]
paragraph = '<h3>Paragraph:</h3>'+prediction[2]
text = widgets.Text(
value='What do we know about diagnostics and surveillance?',
placeholder='Paste ticket description here!',
layout=widgets.Layout(width='50%', height='50px')
out = widgets.Output()
def callback(_):
with out:
# what happens when we press the button
printmd("**<font color=orange> ------------------------------------------------------------------------------------------------------------------------------- </font>**")
printmd(f"**<font color=blue>COVID-19 (Question & Answering System)</font>**")
printmd("**<font color=orange> ------------------------------------------------------------------------------------------------------------------------------- </font>**")
# displaying button and its output together
widgets.VBox([text, out])
VBox(children=(Text(value='What do we know about diagnostics and surveillance?', description='Question:', layo…