# -*- coding: utf-8 -*-
#
# This file is a part of FLUTE tool, developed by MeLoDy Lab at Univ of Pitt.
# Compares list of interactions against local copy of FLUTE database
#
# __author__ = "Emilee Holtzapple"
# __reviewer__ = "Gaoxiang Zhou"
# __time__ = "March 2021"
import argparse
import re
import time
import mysql.connector
import numpy as np
import pandas as pd
[docs]def filter_protein_ints(all_ints_df: pd.DataFrame) -> pd.DataFrame:
"""
This function returns interactions that involves protein either in regulator or regulated element
Parameters
----------
all_ints_df: pd.DataFrame
All interactions from the input file, in DataFrame format, has to include the following columns:
'Regulated Name', 'Regulated ID', 'Regulated Type', 'Regulator Name', 'Regulator ID', 'Regulator Type'
Returns
-------
pt_only_ints: pd.DataFrame
Interactions that involves protein either in regulator or regulated element. Formatted with only
name (lowercase) and ID columns.
"""
rel_ints = []
for i, row in all_ints_df.iterrows():
nm1 = row['Regulated Name']
nm2 = row['Regulator Name']
type1 = row['Regulated Type'].lower()
type2 = row['Regulator Type'].lower()
up_id1 = str(row['Regulated ID'])
up_id2 = str(row['Regulator ID'])
if type1 == 'protein' or type2 == 'protein':
rel_ints.append([nm1.lower(), up_id1, nm2.lower(), up_id2])
pt_only_ints = pd.DataFrame(rel_ints, columns=['Regulated Name', 'Regulated ID', 'Regulator Name', 'Regulator ID'])
return pt_only_ints
[docs]def get_chem_id(pt_only_ints: pd.DataFrame) -> pd.DataFrame:
"""
This function adds CIDm information to protein-chemical interactions
Parameters
----------
pt_only_ints: pd.DataFrame
Interaction dataframe to be populated with CIDm information
Returns
-------
pt_only_ints: pd.DataFrame
Interaction dataframe with CIDm information filled out
"""
for z in range(pt_only_ints.shape[0]):
id1 = pt_only_ints.at[z, 'Regulated ID']
id2 = pt_only_ints.at[z, 'Regulator ID']
if id1.isdigit():
pt_only_ints.at[z, 'Regulated CIDm'] = "CIDm" + '0' * (8 - len(id1)) + id1
if id2.isdigit():
pt_only_ints.at[z, 'Regulator CIDm'] = "CIDm" + '0' * (8 - len(id2)) + id2
return pt_only_ints
[docs]def get_go_id(pt_only_ints: pd.DataFrame) -> pd.DataFrame:
"""
This function adds go information to protein-biological process interactions
Parameters
----------
pt_only_ints: pd.DataFrame
Interaction dataframe to be populated with GO information
Returns
-------
pt_only_ints: pd.DataFrame
Interaction dataframe with GO information filled out
"""
mask1 = pt_only_ints["Regulated ID"].str.upper().str.contains("GO:", na=False)
pt_only_ints.loc[mask1, "Regulated GoID"] = pt_only_ints.loc[mask1, "Regulated ID"].str.upper()
mask2 = pt_only_ints["Regulator ID"].str.upper().str.contains("GO:", na=False)
pt_only_ints.loc[mask2, "Regulator GoID"] = pt_only_ints.loc[mask2, "Regulator ID"].str.upper()
return pt_only_ints
[docs]def get_string_id(pt_only_ints: pd.DataFrame, id_stringid_df: pd.DataFrame) -> pd.DataFrame:
"""
This function adds string_id information to regulated element and regulator in interactions
Parameters
----------
pt_only_ints: pd.DataFrame
Interaction dataframe to be populated with string_id information
id_stringid_df: pd.DataFrame
Species dataframe that links id and string_id information
Returns
-------
pt_only_ints: pd.DataFrame
Interaction dataframe with string_id information filled out
"""
id_stringid_dict = dict(zip(id_stringid_df['ID'], id_stringid_df['stringID']))
pt_only_ints['Regulated stringID'] = pt_only_ints['Regulated ID'].apply(lambda x: id_stringid_dict[x[0:50]])
pt_only_ints['Regulator stringID'] = pt_only_ints['Regulator ID'].apply(lambda x: id_stringid_dict[x[0:50]])
return pt_only_ints
[docs]def get_uid(pt_only_ints: pd.DataFrame) -> pd.DataFrame:
"""
This function adds uid information to regulated element and regulator in interactions:
it is the same as ID if its corresponding stringID exists
Parameters
----------
pt_only_ints: pd.DataFrame
Interaction dataframe to be populated with uid information
Returns
-------
pt_only_ints: pd.DataFrame
Interaction dataframe with uid information filled out
"""
pt_only_ints['Regulated UID'] = \
pt_only_ints[pt_only_ints['Regulated stringID'] == pt_only_ints['Regulated stringID']]['Regulated ID']
pt_only_ints['Regulator UID'] = \
pt_only_ints[pt_only_ints['Regulator stringID'] == pt_only_ints['Regulator stringID']]['Regulator ID']
return pt_only_ints
[docs]def filter_recent_ints(f_in, year_df, x=5):
"""
This function should search input interaction file and find all interactions occurring in
papers less than X years old
Parameters
----------
f_in: str
Input filename that contains list of interactions, it shall include a column "Paper IDs"
year_df: pd.DataFrame
A dataframe with Year, PMCID, PMID columns
x : int
Integer specifying # of years, default is 5
Return
------
df: pd.DataFrame
A dataframe containing all filtered interactions occurring in papers less than X years old
"""
df = pd.read_excel(f_in)
df = df.merge(year_df, left_on='Paper IDs', right_on='PMCID')
df['Year'] = df['Year'].astype('int64')
df = df[df['Year'] >= time.localtime().tm_year - x].drop(columns=year_df.columns)
return df
[docs]def get_duplicates_ints(f_in):
"""
This function calculates the number of duplicated occurrences of an interaction in a reading set.
Interactions with same regulated ID, Regulator ID, and Paper ID are considered as duplicates.
Parameters
----------
f_in: str
Input filename that contains list of interactions, it shall include columns 'Regulated ID',
'Regulator ID', 'Paper IDs'
Return
------
duplicate_counts: pd.DataFrame
a dataframe to indicate the number of occurrences of duplicated interaction in a reading set
"""
df = pd.read_excel(f_in)
columns_to_check = ['Regulated ID', 'Regulator ID', 'Paper IDs']
duplicates = df[df.duplicated(subset=columns_to_check, keep=False)]
duplicate_counts = duplicates.groupby(columns_to_check).size().reset_index(name='Occurrence')
return duplicate_counts.sort_values(by='Occurrence', ascending=False)
[docs]class Query(object):
[docs] def __init__(self, user, password, host, database):
"""
Initialize query with credentials and configuration settings.
Parameters
----------
user : str
Name of the MySQL user where the FLUTE DB is stored, usually 'root'
password : str
Password for the MySQL user where the FLUTE DB is stored.
host : str
Host name for the local machine where the coopy of the FLUTE DB is stored, usually 'localhost'
database : str
Name of the local copy of the FLUTE DB, usually 'flute'
"""
self.user = user
self.password = password
self.host = host
self.database = database
self.db_cnx = mysql.connector.connect(user=self.user,
password=self.password,
host=self.host,
database=self.database,
charset='utf8')
[docs] def ground_string_id(self, df: pd.DataFrame) -> pd.DataFrame:
"""
This function uses the flute database to ground species name and ID to identify stringID.
Parameters
----------
df: pd.DataFrame
containing the columns of 'Name' and 'ID' of all studied species
Returns
-------
df: pd.DataFrame
New column 'stringID' added for all studied species
"""
cursor = self.db_cnx.cursor()
for i, row in df.iterrows():
common_name = str(row['Name'])
common_id = str(row['ID']).strip("\n")
if common_id:
int_query = "SELECT stringID FROM unistringmapping WHERE uniID = %s OR uniID= %s OR ogs= %s"
cursor.execute(int_query, (common_id.upper(), common_name.upper(), common_name.upper()))
for s in cursor.fetchall():
df.at[i, 'stringID'] = s[0]
cursor.close()
return df
[docs] def get_same_papers_ints(self, file_in, year_df):
"""
This function retrieves interactions from the same papers as input spreadsheet
Parameters
----------
file_in: str
File name of a spreadsheet (.xlsx) containing a column name "Paper IDs"
year_df: pd.DataFrame
A DataFrame with Year, PMCID, PMID information
Return
------
ints_same_pp: np.array
Array of interactions that are occurred in the same papers as input file, each interaction is in the
format of (protein1's external ID, protein2's external ID, mode, source in PMID)
"""
df = pd.read_excel(file_in)
pmid_list = list(set(
year_df[year_df['PMCID'].isin(df['Paper IDs'].tolist())]['PMID'].str.replace('PMID:', 'PMID0').tolist()))
cursor = self.db_cnx.cursor()
placeholders = ', '.join(['%s'] * len(pmid_list))
paper_query = f"SELECT protein1, protein2, mode, sources FROM actions_source WHERE sources IN ({placeholders})"
cursor.execute(paper_query, pmid_list)
ints_same_pp = np.array(cursor.fetchall(), dtype="U50")
if len(ints_same_pp) == 0:
return ints_same_pp
info_query = "SELECT external_id FROM protein_info WHERE internal_id=%(internal_id)s"
external_ids = [cursor.execute(info_query, {'internal_id': ele}) or cursor.fetchone()[0] for ele in
ints_same_pp[:, [0, 1]].ravel()]
cursor.close()
ints_same_pp[:, [0, 1]] = np.array(external_ids).reshape(ints_same_pp[:, [0, 1]].shape)
return ints_same_pp
[docs] def filter_pt_ints_by_scoring(self, pt_only_ints: pd.DataFrame, score_tuple: tuple) -> pd.DataFrame:
"""
This function further filters protein-only interactions via multiple tables in database, subject to score tuple.
It would return a DataFrame with scored protein-only interactions
Parameters
----------
pt_only_ints: pd.DataFrame
protein-only interactions with name/id/uid/stringID/GoID/CIDm etc. information filled out
score_tuple: tuple
a tuple of three numbers, denoting thresholds of escore, tscore, dscore
Return
------
pt_scored_ints: pd.DataFrame
interactions further filtered by database entries and score thresholds
"""
cursor = self.db_cnx.cursor(buffered=True)
df_qry1 = \
pt_only_ints[(pt_only_ints['Regulated stringID'].notna()) & (pt_only_ints['Regulator stringID'].notna())][
['Regulated stringID', 'Regulator stringID']]
df_qry1['copy_r_str'] = df_qry1['Regulator stringID']
df_qry1['copy_d_str'] = df_qry1['Regulated stringID']
df_qry1['es'] = score_tuple[0]
df_qry1['ts'] = score_tuple[1]
df_qry1['ds'] = score_tuple[2]
query1 = "SELECT protein1,protein2,escore,tscore,dscore FROM ppi_detail_v11 WHERE " + " OR ".join([
"(((protein1 = %s AND protein2 = %s) OR (protein1 = %s AND protein2 = %s)) AND "
"(escore>=%s AND tscore>=%s AND dscore>=%s))"] * len(df_qry1))
params1 = df_qry1.to_numpy().flatten().tolist()
cursor.execute(query1, params1)
obtained_rows1 = cursor.fetchall()
qry_result1a = pd.DataFrame(obtained_rows1,
columns=['Regulated stringID', 'Regulator stringID', 'escore', 'tscore', 'dscore'])
qry_merged1a = \
pd.merge(qry_result1a, pt_only_ints, on=['Regulated stringID', 'Regulator stringID'], how='left')[
['Regulated ID', 'Regulator ID', 'escore', 'tscore', 'dscore']]
qry_merged1a = qry_merged1a[(qry_merged1a['Regulated ID'].notna()) & (qry_merged1a['Regulator ID'].notna())]
qry_result1b = pd.DataFrame(obtained_rows1,
columns=['Regulator stringID', 'Regulated stringID', 'escore', 'tscore', 'dscore'])
qry_merged1b = \
pd.merge(qry_result1b, pt_only_ints, on=['Regulator stringID', 'Regulated stringID'], how='left')[
['Regulated ID', 'Regulator ID', 'escore', 'tscore', 'dscore']]
qry_merged1b = qry_merged1b[(qry_merged1b['Regulated ID'].notna()) & (qry_merged1b['Regulator ID'].notna())]
df_qry2 = pt_only_ints[(pt_only_ints['Regulated UID'].notna()) & (pt_only_ints['Regulator UID'].notna()) & (
pt_only_ints['Regulated UID'] != pt_only_ints['Regulator UID'])][['Regulated UID', 'Regulator UID']]
df_qry2['Regulated UID'] = 'uniprotkb:' + df_qry2['Regulated UID'].astype(str).str.upper()
df_qry2['Regulator UID'] = 'uniprotkb:' + df_qry2['Regulator UID'].astype(str).str.upper()
df_qry2['copy_r_uid'] = df_qry2['Regulator UID']
df_qry2['copy_d_uid'] = df_qry2['Regulated UID']
query2 = "SELECT upid1,upid2 FROM reactome WHERE " + " OR ".join(
["((upid1=%s AND upid2=%s) OR (upid1=%s AND upid2=%s))"] * len(df_qry2))
params2 = df_qry2.to_numpy().flatten().tolist()
cursor.execute(query2, params2)
obtained_rows2 = cursor.fetchall()
qry_result2a = pd.DataFrame(obtained_rows2, columns=['Regulated UID', 'Regulator UID'])
qry_result2a['Regulated UID'] = qry_result2a['Regulated UID'].astype(str).str.replace('uniprotkb:', '')
qry_result2a['Regulator UID'] = qry_result2a['Regulator UID'].astype(str).str.replace('uniprotkb:', '')
qry_result2a['escore'] = np.nan
qry_result2a['tscore'] = np.nan
qry_result2a['dscore'] = np.nan
qry_merged2a = pd.merge(qry_result2a, pt_only_ints, on=['Regulated UID', 'Regulator UID'], how='left')[
['Regulated UID', 'Regulator UID', 'escore', 'tscore', 'dscore']]
qry_merged2a = qry_merged2a[(qry_merged2a['Regulated UID'].notna()) & (qry_merged2a['Regulator UID'].notna())]
qry_result2b = pd.DataFrame(obtained_rows2, columns=['Regulator UID', 'Regulated UID'])
qry_result2b['Regulated UID'] = qry_result2b['Regulated UID'].astype(str).str.replace('uniprotkb:', '')
qry_result2b['Regulator UID'] = qry_result2b['Regulator UID'].astype(str).str.replace('uniprotkb:', '')
qry_result2b['escore'] = np.nan
qry_result2b['tscore'] = np.nan
qry_result2b['dscore'] = np.nan
qry_merged2b = pd.merge(qry_result2b, pt_only_ints, on=['Regulated UID', 'Regulator UID'], how='left')[
['Regulated UID', 'Regulator UID', 'escore', 'tscore', 'dscore']]
qry_merged2b = qry_merged2b[(qry_merged2b['Regulated UID'].notna()) & (qry_merged2b['Regulator UID'].notna())]
# FIXME: qry3 seems to be extra or in wrong format
# df_qry3 = pt_only_ints[['Regulated ID', 'Regulator ID']]
# df_qry3['copy_d_id'] = df_qry3['Regulated ID']
# df_qry3['copy_r_id'] = df_qry3['Regulator ID']
# query3 = "SELECT ogsA,ogsB FROM biogrid WHERE " + " OR ".join(
# ["( (ogsA = %s AND ogsB = %s) OR (ogsB = %s AND ogsA = %s) )"] * len(df_qry3))
# params3 = df_qry3.to_numpy().flatten().tolist()
# cursor.execute(query3, params3)
# obtained_rows3 = cursor.fetchall()
# qry_result3 = pd.DataFrame(obtained_rows3, columns=['ogsA', 'ogsB'])
df_qry4 = pt_only_ints[(pt_only_ints["Regulated GoID"].notna()) & (pt_only_ints["Regulator UID"].notna()) | (
pt_only_ints["Regulator GoID"].notna()) & (pt_only_ints["Regulated UID"].notna())][
["Regulated GoID", "Regulator UID", "Regulator GoID", "Regulated UID"]].astype(str)
query4 = "SELECT goterm,id1,evidence FROM goterms WHERE " + " OR ".join(
["( (goterm = %s AND id1 = %s) OR (goterm = %s AND id1 = %s) )"] * len(df_qry4))
params4 = df_qry4.to_numpy().flatten().tolist()
cursor.execute(query4, params4)
obtained_rows4 = cursor.fetchall()
qry_result4a = pd.DataFrame(obtained_rows4, columns=['Regulated GoID', 'Regulator UID', 'evidence'])
qry_result4a['ts'] = np.nan
qry_result4a['ds'] = np.nan
qry_merged4a = pd.merge(qry_result4a, pt_only_ints, on=['Regulated GoID', 'Regulator UID'], how='left')[
['Regulated ID', 'Regulator ID', 'evidence', 'ts', 'ds']]
qry_merged4a = qry_merged4a[(qry_merged4a['Regulated ID'].notna()) & (qry_merged4a['Regulator ID'].notna())]
qry_result4b = pd.DataFrame(obtained_rows4, columns=['Regulator GoID', 'Regulated UID', 'evidence'])
qry_result4b['ts'] = np.nan
qry_result4b['ds'] = np.nan
qry_merged4b = pd.merge(qry_result4b, pt_only_ints, on=['Regulator GoID', 'Regulated UID'], how='left')[
['Regulated ID', 'Regulator ID', 'evidence', 'ts', 'ds']]
qry_merged4b = qry_merged4b[(qry_merged4b['Regulated ID'].notna()) & (qry_merged4b['Regulator ID'].notna())]
df_qry5 = \
pt_only_ints[(pt_only_ints["Regulated CIDm"].notna()) & (pt_only_ints["Regulator stringID"].notna()) | (
pt_only_ints["Regulator CIDm"].notna()) & (pt_only_ints["Regulated stringID"].notna())][
["Regulated CIDm", "Regulator stringID", "Regulator CIDm", "Regulated stringID"]].astype(str)
# FIXME: if chem: logs.append([name1, name2, exp, pred, data]) else: ??
query5 = "SELECT chem,prot,exp,text,data FROM pci_detail WHERE " + " OR ".join(
["( (chem= %s AND prot = %s) OR (chem= %s AND prot = %s) )"] * len(df_qry5))
params5 = df_qry5.to_numpy().flatten().tolist()
cursor.execute(query5, params5)
obtained_rows5 = cursor.fetchall()
cursor.close()
qry_result5a = pd.DataFrame(obtained_rows5,
columns=['Regulated CIDm', 'Regulator stringID', 'exp', 'text', 'data'])
qry_merged5a = pd.merge(qry_result5a, pt_only_ints, on=['Regulated CIDm', 'Regulator stringID'], how='left')[
['Regulated ID', 'Regulator ID', 'exp', 'text', 'data']]
qry_merged5a = qry_merged5a[(qry_merged5a['Regulated ID'].notna()) & (qry_merged5a['Regulator ID'].notna())]
qry_result5b = pd.DataFrame(obtained_rows5,
columns=['Regulator CIDm', 'Regulated stringID', 'exp', 'text', 'data'])
qry_merged5b = pd.merge(qry_result5b, pt_only_ints, on=['Regulator CIDm', 'Regulated stringID'], how='left')[
['Regulated ID', 'Regulator ID', 'exp', 'text', 'data']]
qry_merged5b = qry_merged5b[(qry_merged5b['Regulated ID'].notna()) & (qry_merged5b['Regulator ID'].notna())]
# Concatenate arrays
concatenated_array = np.vstack((qry_merged1a.fillna('').to_numpy(dtype='U50'),
qry_merged1b.fillna('').to_numpy(dtype='U50'),
qry_merged2a.fillna('').to_numpy(dtype='U50'),
qry_merged2b.fillna('').to_numpy(dtype='U50'),
qry_merged4a.fillna('').to_numpy(dtype='U50'),
qry_merged4b.fillna('').to_numpy(dtype='U50'),
qry_merged5a.fillna('').to_numpy(dtype='U50'),
qry_merged5b.fillna('').to_numpy(dtype='U50')))
pt_scored_ints = pd.DataFrame(np.unique(concatenated_array, axis=0),
columns=["Element 1 ID", "Element 2 ID", "STRING escore", "STRING tscore",
"STRING dscore"])
return pt_scored_ints
def get_args():
parser = argparse.ArgumentParser(description="Identify high-confidence interactions in machine reading output.")
parser.add_argument('username', type=str, help="MySQL username")
parser.add_argument('password', type=str, help="MySQL password")
parser.add_argument('host', type=str, help="MySQL host")
parser.add_argument('db_name', type=str, help="MySQL database name")
parser.add_argument('input_file', type=str, help="Input file")
parser.add_argument('output_path', type=str, help="Filtered reading output path")
parser.add_argument('-g', '--drop_duplicates', action="store_true", default=False,
help="Whether to output duplicate interactions")
parser.add_argument('-p', '--keep_recent_pps', action="store_true", default=False,
help="If added, show interactions in recent papers")
parser.add_argument('-q', '--prot_query', nargs='?', const=None, default=None,
help="If exists, find interactions related to protein ID")
parser.add_argument('-r', '--add_related_ints', action="store_true", default=False,
help="If added, find additional interactions that are in the same paper set")
args = parser.parse_args()
return args
def main():
args = get_args()
user = args.username
password = args.password
name = args.db_name
host = args.host
f_in = args.input_file
out_path = args.output_path
# User need to unzip supplementary/oa_file_list.txt.zip and put it to input/
# From src/ directory, you can:
# unzip ../supplementary/oa_file_list.txt.zip -d "../examples/input/"
year_df = extract_year("../examples/input/oa_file_list.txt")
# If user wants to choose a different tuple of scores threshold from 0,0,0
score_tuple = (0, 0, 0)
# If user wants to choose a different number of years for "recent years", from x=5
x = 5
query = Query(user, password, host, name)
if args.keep_recent_pps:
optional_output = filter_recent_ints(f_in=f_in, year_df=year_df, x=x)
optional_output.to_excel(args.output_path + '_optional_recent_ints.xlsx', index=False)
if args.drop_duplicates:
optional_output = get_duplicates_ints(f_in)
optional_output.to_excel(args.output_path + '_optional_duplicated_ints.xlsx', index=False)
if args.prot_query:
fq_list = query.get_related_papers(year_df, args.prot_query)
pd.DataFrame(fq_list, columns=['Paper IDs']).to_excel(args.output_path + '_query_' + args.prot_query + '.xlsx',
index=False)
optional_output = query.get_same_papers_ints(args.output_path + '_query_' + args.prot_query + '.xlsx', year_df)
pd.DataFrame(optional_output, columns=['Regulated ID', 'Regulator ID', 'Mechanism', 'Paper IDs']).to_excel(
args.output_path + '_optional_ints_related_to_' + args.prot_query + '.xlsx', index=False)
if args.add_related_ints:
optional_output = query.get_same_papers_ints(f_in, year_df)
pd.DataFrame(optional_output, columns=['Regulated ID', 'Regulator ID', 'Mechanism', 'Paper IDs']).to_excel(
args.output_path + '_optional_ints_in_same_pps.xlsx', index=False)
query.filtered_input_ints(f_in, score_tuple, out_path)
if __name__ == '__main__':
main()