import pandas as pd
import csv
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
def read_data(path):
# 读取CSV文件
df = pd.read_csv(path)
# 将每一列数据保存在列表中
column_data = []
for column in df.columns:
column_data.append(df[column].tolist())
return column_data
def get_fd(path):
data = read_data(path)
fd_from = [fd for fd in zip(data[1], data[2], data[3])]
fd_to = [fd for fd in zip(data[4], data[5], data[6])]
fd_from_uni = set(fd_from)
fd_to_uni = set(fd_to)
return fd_from, fd_to, fd_from_uni, fd_to_uni
def get_tb(path):
data = read_data(path)
td_from = [fd for fd in zip(data[1], data[2])]
td_to = [fd for fd in zip(data[4], data[5])]
td_from_uni = set(td_from)
td_to_uni = set(td_to)
tasks = data[0]
return td_from, td_to, td_from_uni, td_to_uni, tasks
def get_db(path):
data_all = read_data(path)
db_from, db_to = data_all[1], data_all[4]
tasks = data_all[0]
return db_from, db_to, tasks
def cons_db_dict_graph(path):
graph = {}
db_from, db_to, tasks = get_db(path)
db_all = db_from + db_to
for db in db_all:
if db not in graph:
graph[db] = []
for db_f, db_t in zip(db_from, db_to):
have_nodes = graph[db_f]
# 处理重复边,和自己指向自己的边
if db_t not in have_nodes and db_t != db_f:
graph[db_f].append(db_t)
return graph
def save_dict_graph(graph, filename):
with open(filename, 'w') as f:
for node, neighbors in graph.items():
neighbors_str = ".".join(map(str, neighbors))
# f.write(f"{str(node)}: {', '.join(str(neighbors))}\n")
f.write(f"{node}:{neighbors_str}\n")
print(f"Graph data saved to {filename}")
def load_graph(filename):
graph = {}
with open(filename, 'r') as f:
for line in f:
parts = line.strip().split(':')
node = int(parts[0])
neighbors = []
if len(parts) > 1 and parts[1]:
neighbors = list(map(int, parts[1].split('.')))
graph[node] = neighbors
return graph
def dfs(graph, start, path=[], paths=[]):
path = path + [start]
if start not in graph:
return paths
for node in graph[start]:
if node not in path:
paths = dfs(graph, node, path, paths)
paths.append(path) # 添加起点到当前节点的路径
return paths
from collections import deque
def bfs(graph, start, end):
queue = deque([(start, [start])]) # 初始化队列,起始节点和路径
while queue:
current, path = queue.popleft() # 从队列中取出当前节点和路径
if current == end: # 如果当前节点是终点,则返回路径
return path
for neighbor in graph.get(current, []): # 遍历当前节点的邻居节点
if neighbor not in path: # 如果邻居节点不在路径中
queue.append((neighbor, path + [neighbor])) # 将邻居节点和更新后的路径加入队列
return None # 如果没有找到路径,则返回 None
def cons_graph_deduplicate_edges(start, end, tasks):
deduplicate_edges = {}
start_uni = []
end_uni = []
for edge in zip(start, end):
if edge not in deduplicate_edges:
deduplicate_edges[edge] = 1
start_uni.append(edge[0])
end_uni.append(edge[1])
else:
deduplicate_edges[edge] += 1
print(len(start))
print(len(deduplicate_edges))
edges = pd.DataFrame()
edges["start"] = start_uni
edges["end"] = end_uni
G = nx.from_pandas_edgelist(edges, source="start", target="end")
return G
def print_all_paths(G, start):
# 使用广度优先搜索找到所有路径
for end in G.nodes():
if start == end:
continue
paths = list(nx.all_simple_paths(G, start, end))
if paths:
print(f"从节点 {start} 到节点 {end} 的路径:")
for path in paths:
print(path)
def db_graph_info():
path = './process_ret/re_code_all.csv'
db_from, db_to, tasks = get_db(path)
G = cons_graph_deduplicate_edges(db_from, db_to, tasks)
start_node = 1
print_all_paths(G, start_node)
# print(nx.degree(G))
# print(len(list(nx.connected_components(G))))
# print(list(nx.connected_components(G))[0])
# print(list(nx.connected_components(G))[1])
# print(list(nx.connected_components(G))[2])
# nx.draw(G, with_labels=True, edge_color='b', node_color='g', node_size=1000)
# plt.show()
def db_layer(fr, to, path='./process_ret/tb_nums_in_db_degree.csv'):
data = read_data(path)
db_code, out_d, in_d = data[0], data[2], data[3]
db_degree = {}
for info in zip(db_code, out_d, in_d):
db, od, id = info
db_degree[db] = [od, id]
# print(db_degree)
all_db = fr + to
all_db = list(set(all_db))
# print(len(all_db))
layer = []
layer_cur = []
index = 0
# 单独处理第一层
for db in all_db:
# 如果入度为0
if db_degree[db][1] == 0:
layer_cur.append(db)
all_db.remove(db)
layer.append(layer_cur)
layer_cur.clear()
layer_last = []
while len(all_db) > 0:
print(f"before process len of all_db : {len(all_db)}")
# 处理中间的层
# 如果入度不为0,出度不为0,并且入度来自上一层,
for db in all_db:
if db_degree[db][0] != 0 and db_degree[db][1] != 0: # 出度和入度不为0
for edge in zip(fr, to):
f, t = edge
if t == db and f in layer[index]: # 入度节点是来自于上一层
layer_cur.append(db)
all_db.remove(db)
break
if db_degree[db][0] == 0 and db_degree[db][1] != 0: # 出度为0
layer_last.append(db)
all_db.remove(db)
print(f"after process len of all_db : {len(all_db)}")
layer.append(layer_cur)
layer_cur.clear()
index += 1
layer.append(layer_last)
print(len(layer))
def cons_nx_graph(start_fd, end_fd, weights):
edges = pd.DataFrame()
edges["start"] = start_fd
edges["end"] = end_fd
edges["weights"] = weights
G = nx.from_pandas_edgelist(edges,source="start",target="end",edge_attr="weights")
return G
# 之前的入度出度文件此处不能用,因为没有去除重复表达以及自身到自身的度
def db_layer_handle_circle(fr, to, path='./process_ret/tb_nums_in_db_degree.csv'):
layers = []
layer_first = []
layer_last = []
all_db = list(set(fr + to))
print(f"all uni fr len:{len(fr)}, to len: {len(to)}")
print(f"all uni db len:{len(all_db)}")
# data = read_data(path)
# db_code, out_d, in_d = data[0], data[2], data[3]
# print(f"all uni db_code len:{len(db_code)}")
# 重新构造出度和入度的数据,排除重复表达的以及自身到自身的边
# 减少数据量,去掉重复的from - to
relations = [(f, t) for f, t in zip(fr, to)]
print(f"len of relation : {len(relations)}")
print(f"ele relation : {relations[0]}")
uni_relation = list(set(relations)) # 去除完全相同的边
uni_relation = [(f, t) for f, t in uni_relation if f != t] # 去除自身到自身的边
print(f"uni relations len : {len(uni_relation)}")
deduplicate_f = [f for f, _ in uni_relation]
deduplicate_t = [t for _, t in uni_relation]
dict_of_degree = {db: [deduplicate_f.count(db), deduplicate_t.count(db)] for db in all_db}
print(f"len of deduplicate_f and t : {len(deduplicate_f)} , {len(deduplicate_t)}")
print(f"deduplicate_f {deduplicate_f}")
print(f"deduplicate_t {deduplicate_t}")
print(f"dict degree : {dict_of_degree}")
print(f"len of dict degree : {len(dict_of_degree)}")
db_code, out_d, in_d = [], [], []
for k, v in dict_of_degree.items():
db_code.append(k)
out_d.append(v[0])
in_d.append(v[1])
print(f"db code : {db_code}")
print(f"db out_d : {out_d}")
print(f"db in_d : {in_d}")
# # 看看连通关系
# G = cons_nx_graph(deduplicate_f, deduplicate_t, deduplicate_f)
# print(f"lian tong",len(list(nx.connected_components(G))))
print(f"len of rest db all : {len(all_db)}")
# 找入度为0的结点,作为第一层
for info in zip(db_code, out_d, in_d):
db, od, id = info
if id == 0 and od != 0:
layer_first.append(db)
all_db.remove(db)
if od == 0 and id == 0:
layer_first.append(db)
all_db.remove(db)
print(f"len of first layer : {len(layer_first)}")
# 找出度为0的结点,作为最后层
for info in zip(db_code, out_d, in_d):
db, od, id = info
if od == 0 and id != 0 :
layer_last.append(db)
all_db.remove(db)
# print(f"len of first layer: {len(layer_first)}")
# print(f"ele of first layer: {layer_first}")
layers.append(layer_first)
print(f"len of last layer : {len(layer_last)}\n")
# 验证
# uni_re = {}
# for re in zip(fr,to):
# if re not in uni_re:
# uni_re[re] = 1
# print(f"uni dict relation len : {len(uni_re)}")
# 找中间层
index = 0
layer_cur = []
print(f"len of rest db all : {len(all_db)}")
while len(all_db) > 0:
# step1,寻找直接来自于上一层的结点,全部找完之后进行下一步
# for db in all_db:
# for re in uni_relation:
# f, t = re
# if db == t and f in layers[index] and t not in layer_cur: # 某条f到t的关系满足起点f在上一层,终点t是当前的结点
# # print(f"from : {f}, to : {t}")
# # print(f"process cur db : {db}")
# tp = db
# layer_cur.append(tp)
# all_db.remove(db)
# for re in uni_relation:
# f, t = re
# if f in layers[index] and t not in layer_cur and t in all_db:
# # print(f"from : {f}, to : {t}")
# layer_cur.append(t)
# all_db.remove(t)
for db in layers[index]:
for re in uni_relation:
f, t = re
if db == f and t in all_db:
layer_cur.append(t)
all_db.remove(t)
# print(f"cur layer len : {len(layer_cur)} , db : {layer_cur}")
# step2,寻找指向本层的结点,因为要找完,不确定有没有,并且顺序也不知道
flag = 1
# for re in uni_relation:
# f, t = re
# if t in layer_cur and f in all_db: # 存在一个边指向本层结点,并且起点是没使用过的,表明可以开始寻找
# flag = 1
while flag != 0:
flag = 0
for re in uni_relation:
f, t = re
if t in layer_cur and f in all_db: # 存在一个边指向本层结点,并且起点是没使用过的,表明可以开始寻找
flag = 1
layer_cur.append(f)
all_db.remove(f)
layers.append(layer_cur)
print(f"len of cur layer : {len(layer_cur)}")
print(f"len of rest db all : {len(all_db)}")
print("\n")
index += 1
layer_cur.clear()
# print(f"index : {index}")
# print(f"layer all len: {len(layers)}")
path = './process_ret/re_code_all.csv'
data = read_data(path)
fr, to = data[1], data[4]
db_layer_handle_circle(fr, to)
# db_graph_info()
# G = cons_db_dict_graph(path)
# print(G)
# save_dict_graph(G, './process_ret/db_graph.txt')
# start = 1
# end = 238
# print(bfs(G,start,end))
# PATH = dfs(G, start)
# print(PATH)
# g = load_graph('./process_ret/db_graph.txt')
# print(g)