Commit 7ac9e184 authored by 陈威志's avatar 陈威志

init

parents
## PG-Agent: An Agent Powered by Page Graph
[![Paper](http://img.shields.io/badge/Paper-arxiv.2509.03536-99D4C8.svg)](https://arxiv.org/abs/2509.03536)
This is the source code for **page graph construction** and **multi-agent workflow**.
### Data Preparation
------
The open-source datasets we use are from the following repositories:
- AITW & Mind2Web: [here](https://github.com/njucckevin/SeeClick/blob/main/agent_tasks/readme_agent.md)
- GUI Odyssey: [here](https://github.com/OpenGVLab/GUI-Odyssey/blob/master/README.md)
### Page Graph Construction
------
You can run the following code to construct the corresponding page graph.
```
cd document_construction
sh pre.sh
```
- AITW
```
python aitw_document/main.py
```
- Mind2Web
```
python mind2web_document/main.py
```
- GUI Odyssey
```
python odyssey_document/main.py
```
### Multi-agent Workflow
------
You can run the following code to evaluate the agent in the following benchmarks with corresponding page graphs.
```
cd workflow
sh pre.sh
```
- AITW
```
python aitw/aitw_test.py
```
- Mind2Web
```
python mind2web/mind2web_test.py
```
- GUI Odyssey
```
python odyssey/odyssey_test.py
```
### Citation
------
```
@misc{chen2025pgagentagentpoweredpage,
title={PG-Agent: An Agent Powered by Page Graph},
author={Weizhi Chen and Ziwei Wang and Leyang Yang and Sheng Zhou and Xiaoxuan Tang and Jiajun Bu and Yong Li and Wei Jiang},
year={2025},
eprint={2509.03536},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2509.03536},
}
```
\ No newline at end of file
# from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# from qwen_vl_utils import process_vision_info
import random
# import cv2
import copy
from pathlib import Path
from tqdm import tqdm
import requests
from urllib.parse import quote
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from PIL import Image
import prompts
url = "http://localhost:8000/v1/chat/completions"
headers = {
    "Content-Type": "application/json"
}

def chat(img_url_list=(), query: str = '') -> str:
    """Send a multimodal chat request to the local vLLM OpenAI-compatible server.

    Args:
        img_url_list: iterable of image URLs to attach before the text query.
            (The original annotation said ``str``; the loop below requires an
            iterable of URL strings.)
        query: the text prompt.

    Returns:
        The assistant's reply text (``choices[0].message.content``), a str.
    """
    content = []
    for img_url in img_url_list:
        # Percent-encode everything except '/' and ':' so the URL survives JSON transport.
        img_url = quote(img_url, safe='/:')
        content.append({"type": "image_url", "image_url": {"url": img_url}})
    content.append({"type": "text", "text": query})
    data = {
        "model": "Qwen2.5-VL-72B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content}
        ],
        # Deterministic decoding so graph construction is reproducible.
        'temperature': 0
    }
    response = requests.post(url, headers=headers, data=json.dumps(data))
    response = response.json()
    return response['choices'][0]['message']['content']
def action2description(step):
    """Render one AITW step as a human-readable action description.

    Args:
        step: AITW annotation dict with 'img_filename', 'action_type_text',
            'touch' (normalized x,y) and 'type_text' keys.

    Returns:
        A formatted string: click actions include absolute pixel coordinates,
        type actions include the typed text, anything else is just the
        action name.
    """
    action = step['action_type_text']
    if action == 'click':
        # Only clicks need pixel coordinates, so only then pay for opening the
        # screenshot (the original opened it for every action type).
        w, h = Image.open('AITW_simplified/aitw_images/' + step['img_filename'] + '.png').size
        coord1_x, coord1_y = step['touch'][0] * w, step['touch'][1] * h
        description = f'### Action type: {action}\n### Coordinates: ({coord1_x},{coord1_y})'
    elif action == 'type':
        description = f'### Action type: {action}\n### Content: {step["type_text"]}'
    else:
        description = action
    return description
def check_repeat_item(img_path, page_summary, search_document, embedding_model):
    """Check whether the page at *img_path* duplicates an already-known page.

    Retrieves the most similar stored page summaries via FAISS, asks the VLM
    to pick the best textual match, then double-checks visually with both
    screenshots.

    Returns:
        (summary, index) of the matched page, or (None, None) if the page is new.
    """
    if len(search_document) == 0:
        return None, None
    vectorstore = FAISS.from_documents(search_document, embedding_model)
    search_res = vectorstore.similarity_search(page_summary)
    old_description = ""
    for i, res in enumerate(search_res):
        old_description += f'{i+1}. ' + res.page_content + '\n'
    check_repeat_prompt = prompts.check_repeat.format(old_description=old_description)
    check_repeat_res = chat([img_path], check_repeat_prompt)
    # .strip() matches the other construction scripts: without it a trailing
    # newline in the model output defeats the 'None' comparison and then
    # crashes int().
    sample_index = check_repeat_res.split('### Index: ')[1].strip()
    if sample_index == 'None':
        return None, None
    sample_index = int(sample_index) - 1
    old_img_path = search_res[sample_index].metadata['img_path']
    # Visual double-check against the stored screenshot of the candidate.
    double_check_res = chat([old_img_path, img_path], prompts.check_repeat_2)
    double_check_res = double_check_res.split('### Conclusion: ')[1].strip()
    assert double_check_res in ['Yes', 'No']
    if double_check_res == 'No':
        return None, None
    repeat_index = search_res[sample_index].metadata['index']
    new_summary = search_res[sample_index].page_content
    return new_summary, repeat_index
def create_new_item(img_path, knowledge_library, search_document, embedding_model):
    """Summarize the screenshot and either register a brand-new page node or
    refresh the summary of the duplicate it matches; return the node."""
    page_summary = chat([img_path], prompts.page_summary)
    new_summary, repeat_index = check_repeat_item(
        img_path, page_summary, search_document, embedding_model)
    if repeat_index is not None:
        # Known page: keep the freshest summary in both the library and the
        # retrieval corpus, then hand the existing node back.
        knowledge_item = knowledge_library[repeat_index]
        knowledge_item['page_summary'] = new_summary
        search_document[repeat_index].page_content = new_summary
        return knowledge_item
    # Unseen page: allocate the next index and seed an empty outgoing edge.
    idx = len(knowledge_library)
    knowledge_item = {
        'index': idx,
        'page_summary': page_summary,
        'original_image': [],
        'next_page_list': [{'actions': [], 'page_index': None}],
    }
    knowledge_library[idx] = knowledge_item
    search_document.append(Document(
        page_content=page_summary,
        metadata={"index": idx, "img_path": img_path}))
    return knowledge_item
def get_item(img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model):
    """Resolve the knowledge-library node for the current screenshot.

    Returns:
        (knowledge_item, redirection_flag): the node for the current page and
        whether the previous action navigated to a different page.
    """
    if last_page_idx is None:
        # First step of an episode: there is no previous page, treat as a redirection.
        knowledge_item = create_new_item(img_path, knowledge_library, search_document, embedding_model)
        redirection_flag = True
    else:
        # Ask the VLM whether last_action_summary navigated away from the previous page.
        redirection_res = chat([last_img_path, img_path], prompts.redirection_judge.format(action=last_action_summary))
        redirection_res = redirection_res.split('### Conclusion: ')[1].strip()
        assert redirection_res in ['Yes','No']
        if redirection_res == 'Yes':
            knowledge_item = create_new_item(img_path, knowledge_library, search_document, embedding_model)
            redirection_flag = True
        elif redirection_res == 'No':
            knowledge_item = knowledge_library[last_page_idx]
            redirection_flag = False
    # Record the raw screenshot filename on the node (strip the local HTTP prefix).
    knowledge_item['original_image'].append(img_path.split('http://localhost:6666/aitw_images/')[1])
    return knowledge_item, redirection_flag
# ---- Page-graph construction driver for AITW ----------------------------
# Samples 10% of the training episodes per subset and links consecutive
# screenshots into a page graph, one JSON library file per subset.
with open('aitw_annots/aitw_data_train.json', 'r') as f_in:  # 'with' fixes the leaked handle
    aitw_train_data = json.load(f_in)
aitw_data_type_list = ['install', 'googleapps', 'general', 'single', 'webshopping']
embedding_model_name = "bge-m3"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={'device': 'cuda:0'})
for aitw_data_type in aitw_data_type_list:
    knowledge_library = {}
    search_document = []
    # 10% sample keeps construction cost manageable.
    selected_episode = random.sample(aitw_train_data[aitw_data_type], len(aitw_train_data[aitw_data_type]) // 10)
    for episode in tqdm(selected_episode):
        last_page_idx = None
        last_img_path = None
        last_action_summary = None
        for i in range(len(episode)):
            img_path = 'http://localhost:6666/aitw_images/' + episode[i]['img_filename'] + '.png'
            if last_page_idx is not None:
                # Summarize the action that led from the previous screen to this one.
                action_description = action2description(episode[i-1])
                if action_description[:10] == '### Action':
                    last_action_summary = chat([last_img_path], prompts.action_summary.format(action_description=action_description))
                else:
                    last_action_summary = action_description
            knowledge_item, redirection_flag = get_item(img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model)
            if last_page_idx is not None:
                knowledge_library[last_page_idx]['next_page_list'][-1]['actions'].append(last_action_summary)
                knowledge_library[last_page_idx]['next_page_list'][-1]['goal'] = episode[i]['goal']
                if redirection_flag:
                    # Close the edge to the page we landed on and start a fresh one.
                    knowledge_library[last_page_idx]['next_page_list'][-1]['page_index'] = knowledge_item['index']
                    knowledge_library[last_page_idx]['next_page_list'].append({'actions': [], 'page_index': None})
            last_page_idx = knowledge_item['index']
            last_img_path = img_path
    # Context manager guarantees the library file is closed even if dumping fails.
    with open(f'{aitw_data_type}_library.json', 'w') as f_json:
        json.dump(knowledge_library, f_json, ensure_ascii=False, indent=4)
# Prompt templates for AITW page-graph construction.

# Describe the current screen in one sentence.
page_summary = 'Please describe this screen containing following content with one full sentence, including \
the type of page, the function of page and the key components of the screen.'
# Summarize a raw action (type + parameters) as a verb phrase.
action_summary = 'An operation has now been performed on the screen. \
Here is the type of the operation and relevant parameters:\n\
{action_description}\n\
You are required to summarize this operation with a verb phrase that begins with the given operation type.'
# Judge whether an action redirected to a new page (Yes) or stayed in-page (No).
redirection_judge = 'You will receive the images of screens before and after operation \'{action}\'. \
You need to determine whether this operation leads to a new page, or it is just an in-page operation. \
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
# Pick which stored page description (if any) matches the current screen.
check_repeat = 'You are a professional GUI agent. You will be given a screen and some descriptions. \
Your task is to find one description that best fits the current page.\n\
Here are the descriptions:\n\
{old_description}\
You should answer with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Index: <The index of chosen description, or \'None\' if none of them fits>\n\
Do not output anything else.'
# Visual double-check that two screenshots show the same page.
check_repeat_2 = 'Are these two screens similar? You should consider the type, layout, and content of the pages comprehensively.\n\
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
\ No newline at end of file
# from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# from qwen_vl_utils import process_vision_info
import random
# import cv2
import copy
import os
from pathlib import Path
from tqdm import tqdm
import requests
from urllib.parse import quote
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
import prompts
url = "http://localhost:8000/v1/chat/completions"
headers = {
    "Content-Type": "application/json"
}

def chat(img_url_list=(), query: str = '') -> str:
    """Send a multimodal chat request to the local vLLM OpenAI-compatible server.

    Args:
        img_url_list: iterable of image URLs to attach before the text query.
            (The original annotation said ``str``; the loop below requires an
            iterable of URL strings.)
        query: the text prompt.

    Returns:
        The assistant's reply text (``choices[0].message.content``), a str.
    """
    content = []
    for img_url in img_url_list:
        # Percent-encode everything except '/' and ':' so the URL survives JSON transport.
        img_url = quote(img_url, safe='/:')
        content.append({"type": "image_url", "image_url": {"url": img_url}})
    content.append({"type": "text", "text": query})
    data = {
        "model": "Qwen2.5-VL-72B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content}
        ],
        # Deterministic decoding so graph construction is reproducible.
        'temperature': 0
    }
    response = requests.post(url, headers=headers, data=json.dumps(data))
    response = response.json()
    return response['choices'][0]['message']['content']
def get_action_summary(img_path, action):
    """Turn one Mind2Web action record into a natural-language summary.

    Args:
        img_path: URL of the screenshot the action was taken on.
        action: Mind2Web action dict with 'operation' ({'op', 'value'}) and
            'bbox' ({'x', 'y', 'width', 'height'}) entries.

    Returns:
        A verb-phrase summary from the VLM, without a trailing period.

    Raises:
        AssertionError: if the operation type is not CLICK/TYPE/SELECT.
    """
    action_type = action['operation']['op']
    assert action_type in ['CLICK', 'TYPE', 'SELECT']
    # Convert x/y/width/height into an [x1, y1, x2, y2] pixel box.
    bbox = [int(action["bbox"]["x"]), int(action["bbox"]["y"]),
            int(action["bbox"]["x"] + action["bbox"]["width"]),
            int(action["bbox"]["y"] + action["bbox"]["height"])]
    bbox_str = f'[{bbox[0]}, {bbox[1]}, {bbox[2]}, {bbox[3]}]'
    if action_type == 'CLICK':
        query = prompts.click_action_summary.format(bbox=bbox_str)
    elif action_type == 'TYPE':
        query = prompts.type_action_summary.format(content=action['operation']['value'], bbox=bbox_str)
    elif action_type == 'SELECT':
        query = prompts.select_action_summary.format(content=action['operation']['value'], bbox=bbox_str)
    action_summary = chat([img_path], query)
    # endswith() guards the empty-reply case; action_summary[-1] raised IndexError.
    if action_summary.endswith('.'):
        action_summary = action_summary[:-1]
    return action_summary
def check_repeat_item(domain, img_path, page_summary, search_document, embedding_model):
    """Return (summary, index) of a previously-seen page under *domain* that
    matches the current screenshot, or (None, None) if the page is new."""
    docs = search_document[domain]
    if len(docs) == 0:
        return None, None
    # Retrieve candidate pages by summary similarity.
    store = FAISS.from_documents(docs, embedding_model)
    candidates = store.similarity_search(page_summary)
    numbered = ''.join(f'{pos + 1}. {doc.page_content}\n' for pos, doc in enumerate(candidates))
    reply = chat([img_path], prompts.check_repeat.format(old_description=numbered))
    chosen = reply.split('### Index: ')[1].strip()
    if chosen == 'None':
        return None, None
    match = candidates[int(chosen) - 1]
    # Visual double-check against the stored screenshot of the candidate.
    verdict = chat([match.metadata['img_path'], img_path], prompts.check_repeat_2)
    verdict = verdict.split('### Conclusion: ')[1].strip()
    assert verdict in ['Yes', 'No']
    if verdict == 'No':
        return None, None
    return match.page_content, match.metadata['index']
def create_new_item(domain, img_path, knowledge_library, search_document, embedding_model):
    """Summarize the screenshot and either register a new page node under
    *domain* or refresh the summary of the duplicate it matches."""
    page_summary = chat([img_path], prompts.page_summary)
    new_summary, repeat_index = check_repeat_item(
        domain, img_path, page_summary, search_document, embedding_model)
    if repeat_index is not None:
        # Duplicate page: keep the freshest summary in both stores.
        knowledge_item = knowledge_library[domain][repeat_index]
        knowledge_item['page_summary'] = new_summary
        search_document[domain][repeat_index].page_content = new_summary
        return knowledge_item
    # Unseen page: allocate the next index and seed an empty outgoing edge.
    idx = len(knowledge_library[domain])
    knowledge_item = {
        'index': idx,
        'page_summary': page_summary,
        'original_image': [],
        'next_page_list': [{'actions': [], 'page_index': None}],
    }
    knowledge_library[domain][idx] = knowledge_item
    search_document[domain].append(Document(
        page_content=page_summary,
        metadata={"index": idx, "img_path": img_path}))
    return knowledge_item
def get_item(domain, img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model):
    """Resolve the knowledge-library node under *domain* for the current screenshot.

    Returns:
        (knowledge_item, redirection_flag): the node for the current page and
        whether the previous action navigated to a different page.
    """
    if last_page_idx is None:
        # First step of an episode: no previous page, so treat it as a redirection.
        knowledge_item = create_new_item(domain, img_path, knowledge_library, search_document, embedding_model)
        redirection_flag = True
    else:
        # Ask the VLM whether the last action navigated away from the previous page.
        redirection_res = chat([last_img_path, img_path], prompts.redirection_judge.format(action=last_action_summary))
        redirection_res = redirection_res.split('### Conclusion: ')[1].strip()
        assert redirection_res in ['Yes','No']
        if redirection_res == 'Yes':
            knowledge_item = create_new_item(domain, img_path, knowledge_library, search_document, embedding_model)
            redirection_flag = True
        elif redirection_res == 'No':
            knowledge_item = knowledge_library[domain][last_page_idx]
            redirection_flag = False
    # Record the raw screenshot filename on the node (strip the local HTTP prefix).
    knowledge_item['original_image'].append(img_path.split('http://localhost:6667/mind2web_images/')[1])
    return knowledge_item, redirection_flag
# ---- Page-graph construction driver for Mind2Web ------------------------
# Samples 10% of the training episodes and links consecutive screenshots
# into per-domain page graphs, written to one JSON library file.
with open('mind2web_annots/mind2web_data_train.json', 'r') as f_in:  # 'with' fixes the leaked handle
    mind2web_train_data = json.load(f_in)
embedding_model_name = "bge-m3"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={'device': 'cuda:0'})
knowledge_library = {}
search_document = {}
selected_episode = random.sample(mind2web_train_data, len(mind2web_train_data) // 10)
for episode in tqdm(selected_episode):
    last_page_idx = None
    last_img_path = None
    last_action_summary = None
    domain = episode['domain']
    if domain not in knowledge_library:  # idiomatic membership test
        knowledge_library[domain] = {}
        search_document[domain] = []
    goal = episode['confirmed_task']
    episode_id = episode['annotation_id']
    action_list = episode['actions']
    terminate_flag = False
    for i in range(len(action_list)):
        img_path = 'http://localhost:6667/mind2web_images/' + episode_id + '-' + action_list[i]['action_uid'] + '.jpg'
        # Abandon episodes whose screenshots are missing on disk.
        if not os.path.exists('mind2web_images/' + episode_id + '-' + action_list[i]['action_uid'] + '.jpg'):
            terminate_flag = True
            print('IMAGE NOT FOUND')
            print(episode_id + '-' + action_list[i]['action_uid'])
            break
        if last_page_idx is not None:
            last_action_summary = get_action_summary(last_img_path, action_list[i-1])
        knowledge_item, redirection_flag = get_item(domain, img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model)
        if last_page_idx is not None:
            knowledge_library[domain][last_page_idx]['next_page_list'][-1]['actions'].append(last_action_summary)
            knowledge_library[domain][last_page_idx]['next_page_list'][-1]['goal'] = goal
            if redirection_flag:
                # Close the edge to the page we landed on and start a fresh one.
                knowledge_library[domain][last_page_idx]['next_page_list'][-1]['page_index'] = knowledge_item['index']
                knowledge_library[domain][last_page_idx]['next_page_list'].append({'actions': [], 'page_index': None})
        last_page_idx = knowledge_item['index']
        last_img_path = img_path
    if terminate_flag:
        continue
    # Record the episode's final action on the last visited page.
    # NOTE(review): len(action_list) > 1 skips single-action episodes — presumably
    # intentional, but confirm the final action should not be recorded there.
    if len(action_list) > 1:
        last_action_summary = get_action_summary(last_img_path, action_list[-1])
        knowledge_library[domain][last_page_idx]['next_page_list'][-1]['actions'].append(last_action_summary)
        knowledge_library[domain][last_page_idx]['next_page_list'][-1]['goal'] = goal
with open('mind2web_library.json', 'w') as f_json:
    json.dump(knowledge_library, f_json, ensure_ascii=False, indent=4)
\ No newline at end of file
# Prompt templates for Mind2Web page-graph construction.

# Describe the current webpage in one sentence.
page_summary = 'Please describe this screen containing following content with one full sentence, including \
the type of page, the function of page and the key components of the screen.'
# Summaries of click / type / select actions at a pixel bounding box.
click_action_summary = 'This is a page of website. The user clicks the item at coordinates {bbox}. You are required to summarize this operation beginning with \"click\". Do not mention original coordinates.'
type_action_summary = 'This is a page of website. The user types the content \"{content}\" at coordinates {bbox}. You are required to summarize this operation beginning with \"type\". Do not mention original coordinates.'
select_action_summary = 'This is a page of website. The user opens a \"Select Menu\" or \"Dropdown List\" at coordinates {bbox}, and select the option \"{content}\". You are required to summarize this operation beginning with \"select\". Do not mention original coordinates.'
# Judge whether an action redirected to a new page (Yes) or stayed in-page (No).
redirection_judge = 'You will receive the images of screens before and after operation \'{action}\'. \
You need to determine whether this operation leads to a new page, or it is just an in-page operation. \
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
# Pick which stored page description (if any) matches the current webpage.
check_repeat = 'You are a professional GUI agent. You will be given a webpage and some descriptions. \
Your task is to find one description that best fits the current webpage.\n\
Here are the descriptions:\n\
{old_description}\
You should answer with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Index: <The index of chosen description, or \'None\' if none of them fits>\n\
Do not output anything else.'
# Visual double-check that two screenshots show the same page.
check_repeat_2 = 'Are these two screens similar? You should consider the type, layout, and content of the pages comprehensively.\n\
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
# from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# from qwen_vl_utils import process_vision_info
import random
# import cv2
import copy
import os
from pathlib import Path
from tqdm import tqdm
import requests
from urllib.parse import quote
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from PIL import Image
import numpy as np
import prompts
url = "http://localhost:8000/v1/chat/completions"
headers = {
    "Content-Type": "application/json"
}

def chat(img_url_list=(), query: str = '') -> str:
    """Send a multimodal chat request to the local vLLM OpenAI-compatible server.

    Args:
        img_url_list: iterable of image URLs to attach before the text query.
            (The original annotation said ``str``; the loop below requires an
            iterable of URL strings.)
        query: the text prompt.

    Returns:
        The assistant's reply text (``choices[0].message.content``), a str.
    """
    content = []
    for img_url in img_url_list:
        # Percent-encode everything except '/' and ':' so the URL survives JSON transport.
        img_url = quote(img_url, safe='/:')
        content.append({"type": "image_url", "image_url": {"url": img_url}})
    content.append({"type": "text", "text": query})
    data = {
        "model": "Qwen2.5-VL-72B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content}
        ],
        # Deterministic decoding so graph construction is reproducible.
        'temperature': 0
    }
    response = requests.post(url, headers=headers, data=json.dumps(data))
    response = response.json()
    return response['choices'][0]['message']['content']
def get_action_summary(img_path, step):
    """Turn one GUI-Odyssey step into a natural-language action summary.

    Args:
        img_path: URL of the screenshot the action was taken on.
        step: Odyssey step dict with 'action', 'info' and 'screenshot' keys.

    Returns:
        A short verb phrase describing the action.

    Raises:
        AssertionError: for unsupported action types.
        ValueError: for a CLICK/LONG_PRESS whose info is unrecognized.
    """
    action = step['action']
    info = step['info']
    assert action in ['CLICK', 'TEXT', 'SCROLL', 'LONG_PRESS']
    if action == 'CLICK' or action == "LONG_PRESS":
        if info == 'KEY_HOME':
            gt = 'press home to go to the home screen'
        elif info == 'KEY_BACK':
            gt = 'press back to go to the previous screen'
        elif info == 'KEY_APPSELECT':
            gt = 'go to the previous App'
        elif isinstance(info, list):  # isinstance instead of type() == list
            # Coordinates are normalized to 1000; scale to screenshot pixels.
            w, h = Image.open('GUI-Odyssey-master/data/screenshots/' + step['screenshot']).size
            bbox_str = f'[{int(info[0][0]/1000*w)}, {int(info[0][1]/1000*h)}]'
            query = prompts.click_action_summary.format(bbox=bbox_str)
            gt = chat([img_path], query)
            # endswith() guards the empty-reply case; gt[-1] raised IndexError.
            if gt.endswith('.'):
                gt = gt[:-1]
        else:
            raise ValueError(f'Unknown click action {info}')
    elif action == 'SCROLL':
        # Direction is the dominant axis of the swipe vector (end - start).
        start = np.array(info[0])
        end = np.array(info[1])
        delta = end - start
        delta_abs = np.abs(delta)
        lr = 'left' if delta[0] < 0 else 'right'
        ud = 'up' if delta[1] < 0 else 'down'
        if delta_abs[0] > delta_abs[1]:
            gt = f"scroll {lr}"
        else:
            gt = f"scroll {ud}"
    elif action == 'TEXT':
        gt = f'type {info}'
    return gt
def check_repeat_item(domain, img_path, page_summary, search_document, embedding_model):
    """Return (summary, index) of a previously-seen page under *domain* that
    matches the current screenshot, or (None, None) if the page is new."""
    docs = search_document[domain]
    if len(docs) == 0:
        return None, None
    # Retrieve candidate pages by summary similarity.
    store = FAISS.from_documents(docs, embedding_model)
    candidates = store.similarity_search(page_summary)
    numbered = ''.join(f'{pos + 1}. {doc.page_content}\n' for pos, doc in enumerate(candidates))
    reply = chat([img_path], prompts.check_repeat.format(old_description=numbered))
    chosen = reply.split('### Index: ')[1].strip()
    if chosen == 'None':
        return None, None
    match = candidates[int(chosen) - 1]
    # Visual double-check against the stored screenshot of the candidate.
    verdict = chat([match.metadata['img_path'], img_path], prompts.check_repeat_2)
    verdict = verdict.split('### Conclusion: ')[1].strip()
    assert verdict in ['Yes', 'No']
    if verdict == 'No':
        return None, None
    return match.page_content, match.metadata['index']
def create_new_item(domain, img_path, knowledge_library, search_document, embedding_model):
    """Summarize the screenshot and either register a new page node under
    *domain* or refresh the summary of the duplicate it matches."""
    page_summary = chat([img_path], prompts.page_summary)
    new_summary, repeat_index = check_repeat_item(
        domain, img_path, page_summary, search_document, embedding_model)
    if repeat_index is not None:
        # Duplicate page: keep the freshest summary in both stores.
        knowledge_item = knowledge_library[domain][repeat_index]
        knowledge_item['page_summary'] = new_summary
        search_document[domain][repeat_index].page_content = new_summary
        return knowledge_item
    # Unseen page: allocate the next index and seed an empty outgoing edge.
    idx = len(knowledge_library[domain])
    knowledge_item = {
        'index': idx,
        'page_summary': page_summary,
        'original_image': [],
        'next_page_list': [{'actions': [], 'page_index': None}],
    }
    knowledge_library[domain][idx] = knowledge_item
    search_document[domain].append(Document(
        page_content=page_summary,
        metadata={"index": idx, "img_path": img_path}))
    return knowledge_item
def get_item(domain, img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model):
    """Resolve the knowledge-library node under *domain* for the current screenshot.

    Returns:
        (knowledge_item, redirection_flag): the node for the current page and
        whether the previous action navigated to a different page.
    """
    if last_page_idx is None:
        # First step of an episode: no previous page, so treat it as a redirection.
        knowledge_item = create_new_item(domain, img_path, knowledge_library, search_document, embedding_model)
        redirection_flag = True
    else:
        # Ask the VLM whether the last action navigated away from the previous page.
        redirection_res = chat([last_img_path, img_path], prompts.redirection_judge.format(action=last_action_summary))
        redirection_res = redirection_res.split('### Conclusion: ')[1].strip()
        assert redirection_res in ['Yes','No']
        if redirection_res == 'Yes':
            knowledge_item = create_new_item(domain, img_path, knowledge_library, search_document, embedding_model)
            redirection_flag = True
        elif redirection_res == 'No':
            knowledge_item = knowledge_library[domain][last_page_idx]
            redirection_flag = False
    # Record the raw screenshot path on the node (strip the local HTTP prefix).
    knowledge_item['original_image'].append(img_path.split('http://localhost:6668/')[1])
    return knowledge_item, redirection_flag
# ---- Page-graph construction driver for GUI Odyssey ---------------------
# Samples 1/50 of the training episodes and links consecutive screenshots
# into per-category page graphs, written to one JSON library file.
with open('data/splits/splits_random_split.json', 'r') as f_in:  # 'with' fixes the leaked handle
    odyssey_data = json.load(f_in)
annotations_path = 'data/annotations/'
imgs_path = 'data/screenshots/'
embedding_model_name = "bge-m3"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={'device': 'cuda:0'})
knowledge_library = {}
search_document = {}
selected_episode_idx = random.sample(odyssey_data['train'], len(odyssey_data['train']) // 50)
for train_idx in tqdm(selected_episode_idx):
    with open(annotations_path + train_idx, 'r') as f_ep:
        episode = json.load(f_ep)
    last_page_idx = None
    last_img_path = None
    last_action_summary = None
    domain = episode['task_info']['category']
    if domain not in knowledge_library:  # idiomatic membership test
        knowledge_library[domain] = {}
        search_document[domain] = []
    goal = episode['task_info']['instruction']
    action_list = episode['steps']
    for i in range(len(action_list)):
        img_path = 'http://localhost:6668/' + action_list[i]['screenshot']
        if last_page_idx is not None:
            # Summarize the action that led from the previous screen to this one.
            last_action_summary = get_action_summary(last_img_path, action_list[i-1])
        knowledge_item, redirection_flag = get_item(domain, img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model)
        if last_page_idx is not None:
            knowledge_library[domain][last_page_idx]['next_page_list'][-1]['actions'].append(last_action_summary)
            knowledge_library[domain][last_page_idx]['next_page_list'][-1]['goal'] = goal
            if redirection_flag:
                # Close the edge to the page we landed on and start a fresh one.
                knowledge_library[domain][last_page_idx]['next_page_list'][-1]['page_index'] = knowledge_item['index']
                knowledge_library[domain][last_page_idx]['next_page_list'].append({'actions': [], 'page_index': None})
        last_page_idx = knowledge_item['index']
        last_img_path = img_path
with open('odyssey_library.json', 'w') as f_json:
    json.dump(knowledge_library, f_json, ensure_ascii=False, indent=4)
# Prompt templates for GUI-Odyssey page-graph construction.

# Describe the current screen in one sentence.
page_summary = 'Please describe this screen containing following content with one full sentence, including \
the type of page, the function of page and the key components of the screen.'
# Summarize a click at a pixel coordinate as a verb phrase.
click_action_summary = 'The user clicks the item at coordinates {bbox}. You are required to summarize this operation with a verb phrase that begins with \"click\". Do not mention original coordinates.'
# Judge whether an action redirected to a new page (Yes) or stayed in-page (No).
redirection_judge = 'You will receive the images of screens before and after operation \'{action}\'. \
You need to determine whether this operation leads to a new page, or it is just an in-page operation. \
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
# Pick which stored page description (if any) matches the current screen.
check_repeat = 'You are a professional GUI agent. You will be given a screen and some descriptions. \
Your task is to find one description that best fits the current page.\n\
Here are the descriptions:\n\
{old_description}\
You should answer with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Index: <The index of chosen description, or \'None\' if none of them fits>\n\
Do not output anything else.'
# Visual double-check that two screenshots show the same page.
check_repeat_2 = 'Are these two screens similar? You should consider the type, layout, and content of the pages comprehensively.\n\
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
# Environment setup for page-graph construction and evaluation.
# Installs the retrieval stack (LangChain + FAISS + embeddings) …
pip install langchain
conda install -c pytorch faiss-gpu
pip install -U langchain-community
pip install sentence-transformers
# numpy is pinned for compatibility with the stack above — TODO confirm the pin is still required.
pip install numpy==1.23.2
pip install -U langchain-huggingface
pip install jax
pip install jaxlib
# … then vLLM, and serves Qwen2.5-VL-72B behind an OpenAI-compatible API
# (tensor-parallel over 4 GPUs, at most 2 images per prompt).
pip install --upgrade vllm
python -m vllm.entrypoints.openai.api_server --served-model-name Qwen2.5-VL-72B-Instruct --model Qwen2.5-VL-72B-Instruct -tp 4 --limit_mm_per_prompt image=2
This diff is collapsed.
This diff is collapsed.
AITW_ACTION_SPACE = '''
1. Click(x, y): An action of click a coordinate point on the smartphone screen and x,y is the position of the coordinate point on the screen.
Your click location should be a UI element or text on the screen.
A simple use case could be Click(100,238), which means you click the UI element at (100,238) on the current screen.
2. Type("typed_text"): An action of typing a piece of text.
A simple use case can be text("Hello, world!"), which inserts the string "Hello, world!" into the input area on the smartphone screen.
3. Scroll("direction"): This function is used to scroll the screen to a specific direction.
"direction" is a string that represents one of the four directions: "up", "down", "left", "right".
A simple use case could be Scroll("up"), which means you take a scroll up action on the current screen.
4. Back(): The action for returning to the previous step.
5. Home(): The action for returning to the homepage.
6. Enter(): The action of pressing the ENTER key to submit input content.
7. Complete: It means you think the task is complete.
'''
AITW_OBSERVATION_PROMT = f"""
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
You will be given user's ultimate purpose and the previous actions that you have taken.
Your task is to carefully observe the screen, descripe it and conclude some useful clues in one sentence.
Now you can start to observe:
### User's purpose ###
<goal>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Observation ###
"""
AITW_PLANNING_PROMT = f"""
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
Your task is to plan the next action to complete user's purpose with the help of references.
I will give you several important information:
### User's purpose ###
This is the user's global purpose, and your goal is to complete it:
<goal>
### Observation ###
This is the observation of the screen and some useful clues that help you plan:
<observation>
### Global Plan ###
This is the global plan for completing user's purpose:
<global_plan>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Reference ###
There are some reference actions that you can follow:
<reference>
Based on given information, you are required to output with following format:
1. <Please decide which sub-goal in the \"### Global Plan ###\" should be executed based on the screen image>
2. <Check if the user's global purpose has been completed. If the current screen state matches the user's global purpose, directly suggest that the task has been completed>
3. <If the global purpose is not completed: Inspired by \"### Reference ###\", you can list some actions than can possibly push the task progress or complete the goal>
"""
# Executor prompt for AITW: given the chosen plan and reference actions, pick
# exactly one concrete action from AITW_ACTION_SPACE for the current
# screenshot. Placeholders substituted at runtime: <action_plan>, <reference>.
# Fix vs. original: "weather" -> "whether"; redundant \" escapes dropped
# (string value unchanged).
AITW_EXECUTION_PROMT = f"""
You are a smart GUI agent, capable of comprehensively understanding the GUI interface.
You will be given a smartphone screenshot and a plan that you decide to take.
Before you start, I will explain the data format:
### Plan ###
This is your plan:
<action_plan>
### Action Space ###
These are the functions to interact with the phone:
{AITW_ACTION_SPACE}
### Reference ###
There are some reference actions that you can follow:
<reference>
Now please choose one action in "### Action Space ###" for the current screen state based on "### Plan ###" and "### Reference ###".
You should output with following format:
### Thought ###
According to "### Plan ###", you should first determine whether the purpose has been completed. If not, think step-by-step and output the action that should be taken currently.
### Action ###
The action you finally choose from "### Action Space ###". Do not output anything else.
"""
# One-shot global planning prompt for AITW: asks the model to split the task
# (<goal>, substituted at runtime) into 2~3 abstract, high-level sub-goals.
# Fix vs. original: dropped the pointless f-prefix (no interpolation) and
# redundant \" escapes (string value unchanged).
AITW_GLOBAL_PLANNING_PROMT = '''
You are an agent that is trained to complete certain tasks on a smartphone. You will be given a screenshot of a smartphone app.
The global task you should complete is:
"<goal>"
Now, carefully analyze all the above content and provide your output in the following format:
### Global Plan ###
Please break down the overall task into 2~3 simple sub-goals.
Note that since you can’t see future phone screenshots, each sub-goal should be abstract, high-level, and not involve interacting with specific UI elements.
'''
# Prompt asking the VLM for a one-sentence screenshot summary (page type,
# function, a few key components) when building page-node descriptions.
PAGE_SUMMARY_PROMPT = 'Please describe this screen containing following content with one full sentence: \
the type of page, the function of page and a few key components of the screen.'
# str.format template rendering one retrieved page-graph reference entry:
# {idx} is the rank, {actions} the reference actions, {goals} the goals they serve.
REFERENCE_FORMAT = '''{idx}.
You can take following action: {actions}.
This can help you achieve goals like: {goals}.
'''
# Prompt (filled via str.format with {coordinates}) asking the model to
# summarize a recorded click as a short verb phrase.
ACTION_SUMMARY_PROMPT = 'A click operation has now been performed at coordinates {coordinates}. \
You are required to summarize this operation with a verb phrase.'
This diff is collapsed.
This diff is collapsed.
# Action space description injected into the Mind2Web execution prompt:
# Click / Type / Select, each addressed by on-screen (x, y) coordinates.
# Fixes vs. original: "positon" -> "position", "cordinates" -> "coordinates";
# PEP 8 spacing around "="; redundant \" escapes dropped (value unchanged).
MIND2WEB_ACTION_SPACE = '''
1. Click(x,y): An action of clicking a coordinate point on the web screen and x,y is the position of the coordinate point on the screen.
Your click location should be a UI element or text on the screen.
A simple use case could be Click(100,238), which means you click the UI element at (100,238) on the current screen.
2. Type(x,y,"typed_text"): An action of typing a piece of text at the position with coordinates x and y.
A simple use case could be Type(340,212,"Where was Obama born?"), which inputs the string "Where was Obama born?" into the input area at the coordinates (340,212) on the web screen.
3. Select(x,y,"option"): An action of opening a "Select Menu" or "Dropdown List" located at coordinates (x, y) and choose an option you specify.
A simple use case could be Select(679,437,"female"), which opens the list at the coordinates (679,437) and select the option "female" from the list.
'''
# Observer prompt for Mind2Web: summarize the current webpage and useful clues
# in one sentence, given the goal (<goal>) and action history (<history>),
# both substituted at runtime.
# Fixes vs. original: "descripe" -> "describe"; dropped the pointless f-prefix
# (no interpolation).
MIND2WEB_OBSERVATION_PROMT = """
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
You will be given user's ultimate purpose and the previous actions that you have taken.
Your task is to carefully observe the screen, describe it and conclude some useful clues in one sentence.
Now you can start to observe:
### User's purpose ###
<goal>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Observation ###
"""
# One-shot global planning prompt for Mind2Web: asks the model to split the
# task (<goal>, substituted at runtime) into 2~3 abstract, high-level sub-goals.
# Fix vs. original: dropped the pointless f-prefix (no interpolation) and
# redundant \" escapes (string value unchanged).
MIND2WEB_GLOBAL_PLANNING_PROMT = '''
You are an agent that is trained to complete certain tasks on the webpage. You will be given a screenshot of a website.
The global task you should complete is:
"<goal>"
Now, carefully analyze all the above content and provide your output in the following format:
### Global Plan ###
Please break down the overall task into 2~3 simple sub-goals.
Note that since you can’t see future webpages, each sub-goal should be abstract, high-level, and not involve interacting with specific UI elements.
'''
# Planner prompt for Mind2Web: given the goal, observation, global plan,
# history and references, pick the active sub-goal and list candidate actions.
# Placeholders substituted at runtime: <goal>, <observation>, <global_plan>,
# <history>, <reference>.
# Fixes vs. original: "than can" -> "that can"; dropped the pointless f-prefix
# (no interpolation) and redundant \" escapes (string value unchanged).
MIND2WEB_PLANNING_PROMT = """
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
Your task is to plan the next action to complete user's purpose with the help of references.
I will give you several important information:
### User's purpose ###
This is the user's global purpose, and your goal is to complete it:
<goal>
### Observation ###
This is the observation of the screen and some useful clues that help you plan:
<observation>
### Global Plan ###
This is the global plan for completing user's purpose:
<global_plan>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Reference ###
There are some reference actions that you can follow:
<reference>
Based on given information, you are required to output with following format:
1. <Please decide which sub-goal in the "### Global Plan ###" should be executed based on the screen image>
2. <Inspired by "### Reference ###", you can list some actions that can possibly push the task progress or complete the goal>
"""
# Executor prompt for Mind2Web: choose exactly one action from
# MIND2WEB_ACTION_SPACE based on the plan (<action_plan>) and retrieved
# references (<reference>), both substituted at runtime.
MIND2WEB_EXECUTION_PROMT = f"""
You are a smart GUI agent, capable of comprehensively understanding the GUI interface.
You will be given a screenshot of a website and a plan that you decide to take.
Before you start, I will explain the data format:
### Plan ###
This is your plan:
<action_plan>
### Reference ###
There are some reference actions that you can follow:
<reference>
### Action Space ###
These are the functions to interact with the webpage:
{MIND2WEB_ACTION_SPACE}
Now please choose one action in \"### Action Space ###\" for the current webpage based on \"### Plan ###\" and \"### Reference ###\".
You should output with following format:
### Thought ###
Think step-by-step and output the action that should be taken currently.
### Action ###
Output only one action you finally choose from \"### Action Space ###\". Do not output anything else.
"""
# Prompts (filled via str.format) asking the model to summarize a recorded web
# action as a short phrase; the key selects the action type.
# {bbox}: element coordinates; {content}: typed text or selected option.
ACTION_SUMMARY_PROMPT = {
'click_action_summary' : 'This is a page of website. The user clicks the item at coordinates {bbox}. You are required to summarize this operation beginning with \"click\". Do not mention original coordinates.',
'type_action_summary' : 'This is a page of website. The user types the content \"{content}\" at coordinates {bbox}. You are required to summarize this operation beginning with \"type\". Do not mention original coordinates.',
'select_action_summary' : 'This is a page of website. The user opens a \"Select Menu\" or \"Dropdown List\" at coordinates {bbox}, and select the option \"{content}\". You are required to summarize this operation beginning with \"select\". Do not mention original coordinates.'
}
# Prompt asking the VLM for a one-sentence page summary (page type, function,
# key components) when building page-node descriptions.
PAGE_SUMMARY_PROMPT = 'Please describe this screen containing following content with one full sentence, including \
the type of page, the function of page and the key components of the screen.'
# str.format template rendering one retrieved page-graph reference entry:
# {idx} is the rank, {actions} the reference actions, {goals} the goals they serve.
REFERENCE_FORMAT = '''{idx}.
You can take following action: {actions}.
This can help you achieve goals like: {goals}.
'''
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
# Action space description injected into the GUI Odyssey execution prompt:
# click, type, scroll, navigation buttons, task-termination signals, long press.
# Fixes vs. original: "an UI element" -> "a UI element" (matches the wording
# used for CLICK above); "where x and y is" -> "where x and y are".
ODYSSEY_ACTION_SPACE = '''
1. 'CLICK: (x,y)': An action of clicking a coordinate point on the smartphone screen and x,y is the position of the coordinate point on the screen.
Your click location should be a UI element or text on the screen.
A simple use case could be 'CLICK: (100,238)', which means you click the UI element at (100,238) on the current screen.
2. 'TYPE: typed_text': An action of typing a piece of text.
A simple use case can be 'TYPE: Hello, world!', which inserts the string "Hello, world!" into the input area on the smartphone screen.
3. 'SCROLL: direction': This function is used to scroll a UI element shown on the smartphone screen, usually a scroll view or a slide bar.
"direction" is a string that represents one of the four directions: UP, DOWN, LEFT, RIGHT.
A simple use case could be 'SCROLL: UP', which means you take a scroll up action on the current screen.
4. 'PRESS_BACK': The action for returning to the previous screen.
5. 'PRESS_HOME': The action for returning to the homepage.
6. 'PRESS_RECENT': The action to go to the previous App.
7. 'COMPLETE': It means you think the task has been completed based on current screen.
8. 'IMPOSSIBLE': It means you think the task cannot be completed based on current screen.
9. 'LONG_PRESS: (x,y)': An action of pressing a coordinate point on the smartphone screen for a long time to copy texts or download images, where x and y are the position of the coordinate point on the screen.
'''
# Observer prompt for GUI Odyssey: summarize the current screen and useful
# clues in one sentence, given the goal (<goal>) and action history
# (<history>), both substituted at runtime.
# Fixes vs. original: "descripe" -> "describe"; dropped the pointless f-prefix
# (no interpolation).
ODYSSEY_OBSERVATION_PROMT = """
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
You will be given user's ultimate purpose and the previous actions that you have taken.
Your task is to carefully observe the screen, describe it and conclude some useful clues in one sentence.
Now you can start to observe:
### User's purpose ###
<goal>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Observation ###
"""
# One-shot global planning prompt for GUI Odyssey: asks the model to split the
# task (<goal>, substituted at runtime) into 2~3 abstract, high-level sub-goals.
# Fix vs. original: dropped the pointless f-prefix (no interpolation) and
# redundant \" escapes (string value unchanged).
ODYSSEY_GLOBAL_PLANNING_PROMT = '''
You are an agent that is trained to complete certain tasks on a smartphone. You will be given a screenshot of a smartphone app.
The global task you should complete is:
"<goal>"
Now, carefully analyze all the above content and provide your output in the following format:
### Global Plan ###
Please break down the overall task into 2~3 simple sub-goals.
Note that since you can’t see future phone screenshots, each sub-goal should be abstract, high-level, and not involve interacting with specific UI elements.
'''
# Planner prompt for GUI Odyssey: given the goal, observation, global plan,
# history and references, decide the active sub-goal, check completion, and
# list candidate actions. Placeholders substituted at runtime: <goal>,
# <observation>, <global_plan>, <history>, <reference>.
# Fixes vs. original: "than can" -> "that can"; dropped the pointless f-prefix
# (no interpolation) and redundant \" escapes (string value unchanged).
ODYSSEY_PLANNING_PROMT = """
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
Your task is to plan the next action to complete user's purpose with the help of references.
I will give you several important information:
### User's purpose ###
This is the user's global purpose, and your goal is to complete it:
<goal>
### Observation ###
This is the observation of the screen and some useful clues that help you plan:
<observation>
### Global Plan ###
This is the global plan for completing user's purpose:
<global_plan>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Reference ###
There are some reference actions that you can follow:
<reference>
Based on given information, you are required to output with following format:
1. <Please decide which sub-goal in the "### Global Plan ###" should be executed based on the screen image>
2. <Check if the user's global purpose has been completed. If the current screen state matches the user's global purpose, directly suggest that the task has been completed>
3. <If the global purpose is not completed: Inspired by "### Reference ###", you can list some actions that can possibly push the task progress or complete the goal>
"""
# Executor prompt for GUI Odyssey: given the chosen plan and reference
# actions, pick exactly one concrete action from ODYSSEY_ACTION_SPACE for the
# current screenshot. Placeholders substituted at runtime: <action_plan>,
# <reference>.
# Fix vs. original: "weather" -> "whether"; redundant \" escapes dropped
# (string value unchanged).
ODYSSEY_EXECUTION_PROMT = f"""
You are a smart GUI agent, capable of comprehensively understanding the GUI interface.
You will be given a smartphone screenshot and a plan that you decide to take.
Before you start, I will explain the data format:
### Plan ###
This is your plan:
<action_plan>
### Action Space ###
These are the functions to interact with the phone:
{ODYSSEY_ACTION_SPACE}
### Reference ###
There are some reference actions that you can follow:
<reference>
Now please choose one action in "### Action Space ###" for the current screen state based on "### Plan ###" and "### Reference ###".
You should output with following format:
### Thought ###
According to "### Plan ###", you should first determine whether the purpose has been completed. If not, think step-by-step and output the action that should be taken currently.
### Action ###
The action you finally choose from "### Action Space ###". Do not output anything else.
"""
# str.format template rendering one retrieved page-graph reference entry:
# {idx} is the rank, {actions} the reference actions, {goals} the goals they serve.
REFERENCE_FORMAT = '''{idx}.
You can take following action: {actions}.
This can help you achieve goals like: {goals}.
'''
# Prompt asking the VLM for a one-sentence page summary (page type, function,
# key components) when building page-node descriptions.
PAGE_SUMMARY_PROMPT = 'Please describe this screen containing following content with one full sentence, including \
the type of page, the function of page and the key components of the screen.'
# Prompt (filled via str.format with {bbox}) asking the model to summarize a
# recorded click as a verb phrase starting with "click".
ACTION_SUMMARY_PROMPT = 'The user clicks the item at coordinates {bbox}. You are required to summarize this operation with a verb phrase that begins with \"click\". Do not mention original coordinates.'
# Environment setup for the PG-Agent workflow.
# Retrieval / embedding dependencies: LangChain community integrations,
# HuggingFace embeddings, JAX, FAISS vector store, sentence-transformers.
pip install langchain_community
pip install langchain_huggingface
pip install jax
pip install jaxlib
pip install faiss-gpu
pip install sentence-transformers
# Serve Qwen2.5-VL-72B-Instruct through vLLM's OpenAI-compatible API with
# 4-way tensor parallelism, allowing up to 2 images per prompt.
python -m vllm.entrypoints.openai.api_server --served-model-name Qwen2.5-VL-72B-Instruct --model Qwen2.5-VL-72B-Instruct -tp 4 --limit_mm_per_prompt image=2
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment