Commit 7ac9e184 authored by 陈威志's avatar 陈威志

init

parents
## PG-Agent: An Agent Powered by Page Graph
[![Paper](http://img.shields.io/badge/Paper-arxiv.2509.03536-99D4C8.svg)](https://arxiv.org/abs/2509.03536)
This is the source code for **page graph construction** and **multi-agent workflow**.
### Data Preparation
------
The open-source datasets we use are from the following repositories:
- AITW & Mind2Web: [here](https://github.com/njucckevin/SeeClick/blob/main/agent_tasks/readme_agent.md)
- GUI Odyssey: [here](https://github.com/OpenGVLab/GUI-Odyssey/blob/master/README.md)
### Page Graph Construction
------
You can run the following code to construct the corresponding page graph.
```
cd document_construction
sh pre.sh
```
- AITW
```
python aitw_document/main.py
```
- Mind2Web
```
python mind2web_document/main.py
```
- GUI Odyssey
```
python odyssey_document/main.py
```
### Multi-agent Workflow
------
You can run the following code to evaluate the agent in the following benchmarks with corresponding page graphs.
```
cd workflow
sh pre.sh
```
- AITW
```
python aitw/aitw_test.py
```
- Mind2Web
```
python mind2web/mind2web_test.py
```
- GUI Odyssey
```
python odyssey/odyssey_test.py
```
### Citation
------
```
@misc{chen2025pgagentagentpoweredpage,
title={PG-Agent: An Agent Powered by Page Graph},
author={Weizhi Chen and Ziwei Wang and Leyang Yang and Sheng Zhou and Xiaoxuan Tang and Jiajun Bu and Yong Li and Wei Jiang},
year={2025},
eprint={2509.03536},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2509.03536},
}
```
\ No newline at end of file
# from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# from qwen_vl_utils import process_vision_info
import random
# import cv2
import copy
from pathlib import Path
from tqdm import tqdm
import requests
from urllib.parse import quote
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from PIL import Image
import prompts
url = "http://localhost:8000/v1/chat/completions"
headers = {
    "Content-Type": "application/json"
}

def chat(img_url_list=(), query: str = '') -> str:
    """Send a multimodal chat request to the local vLLM OpenAI-compatible server.

    Args:
        img_url_list: iterable of image URLs to attach before the text query.
            (The original annotation said ``str``; the loop below requires an
            iterable of URL strings.)
        query: the text prompt.

    Returns:
        The assistant's reply text (``choices[0].message.content``), a str.
    """
    content = []
    for img_url in img_url_list:
        # Percent-encode everything except '/' and ':' so the URL survives JSON transport.
        img_url = quote(img_url, safe='/:')
        content.append({"type": "image_url", "image_url": {"url": img_url}})
    content.append({"type": "text", "text": query})
    data = {
        "model": "Qwen2.5-VL-72B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content}
        ],
        # Deterministic decoding so graph construction is reproducible.
        'temperature': 0
    }
    response = requests.post(url, headers=headers, data=json.dumps(data))
    response = response.json()
    return response['choices'][0]['message']['content']
def action2description(step):
    """Render one AITW step as a human-readable action description.

    Args:
        step: AITW annotation dict with 'img_filename', 'action_type_text',
            'touch' (normalized x,y) and 'type_text' keys.

    Returns:
        A formatted string: click actions include absolute pixel coordinates,
        type actions include the typed text, anything else is just the
        action name.
    """
    action = step['action_type_text']
    if action == 'click':
        # Only clicks need pixel coordinates, so only then pay for opening the
        # screenshot (the original opened it for every action type).
        w, h = Image.open('AITW_simplified/aitw_images/' + step['img_filename'] + '.png').size
        coord1_x, coord1_y = step['touch'][0] * w, step['touch'][1] * h
        description = f'### Action type: {action}\n### Coordinates: ({coord1_x},{coord1_y})'
    elif action == 'type':
        description = f'### Action type: {action}\n### Content: {step["type_text"]}'
    else:
        description = action
    return description
def check_repeat_item(img_path, page_summary, search_document, embedding_model):
    """Check whether the page at *img_path* duplicates an already-known page.

    Retrieves the most similar stored page summaries via FAISS, asks the VLM
    to pick the best textual match, then double-checks visually with both
    screenshots.

    Returns:
        (summary, index) of the matched page, or (None, None) if the page is new.
    """
    if len(search_document) == 0:
        return None, None
    vectorstore = FAISS.from_documents(search_document, embedding_model)
    search_res = vectorstore.similarity_search(page_summary)
    old_description = ""
    for i, res in enumerate(search_res):
        old_description += f'{i+1}. ' + res.page_content + '\n'
    check_repeat_prompt = prompts.check_repeat.format(old_description=old_description)
    check_repeat_res = chat([img_path], check_repeat_prompt)
    # .strip() matches the other construction scripts: without it a trailing
    # newline in the model output defeats the 'None' comparison and then
    # crashes int().
    sample_index = check_repeat_res.split('### Index: ')[1].strip()
    if sample_index == 'None':
        return None, None
    sample_index = int(sample_index) - 1
    old_img_path = search_res[sample_index].metadata['img_path']
    # Visual double-check against the stored screenshot of the candidate.
    double_check_res = chat([old_img_path, img_path], prompts.check_repeat_2)
    double_check_res = double_check_res.split('### Conclusion: ')[1].strip()
    assert double_check_res in ['Yes', 'No']
    if double_check_res == 'No':
        return None, None
    repeat_index = search_res[sample_index].metadata['index']
    new_summary = search_res[sample_index].page_content
    return new_summary, repeat_index
def create_new_item(img_path, knowledge_library, search_document, embedding_model):
    """Summarize the screenshot and either register a brand-new page node or
    refresh the summary of the duplicate it matches; return the node."""
    page_summary = chat([img_path], prompts.page_summary)
    new_summary, repeat_index = check_repeat_item(
        img_path, page_summary, search_document, embedding_model)
    if repeat_index is not None:
        # Known page: keep the freshest summary in both the library and the
        # retrieval corpus, then hand the existing node back.
        knowledge_item = knowledge_library[repeat_index]
        knowledge_item['page_summary'] = new_summary
        search_document[repeat_index].page_content = new_summary
        return knowledge_item
    # Unseen page: allocate the next index and seed an empty outgoing edge.
    idx = len(knowledge_library)
    knowledge_item = {
        'index': idx,
        'page_summary': page_summary,
        'original_image': [],
        'next_page_list': [{'actions': [], 'page_index': None}],
    }
    knowledge_library[idx] = knowledge_item
    search_document.append(Document(
        page_content=page_summary,
        metadata={"index": idx, "img_path": img_path}))
    return knowledge_item
def get_item(img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model):
    """Resolve the knowledge-library node for the current screenshot.

    Returns:
        (knowledge_item, redirection_flag): the node for the current page and
        whether the previous action navigated to a different page.
    """
    if last_page_idx is None:
        # First step of an episode: there is no previous page, treat as a redirection.
        knowledge_item = create_new_item(img_path, knowledge_library, search_document, embedding_model)
        redirection_flag = True
    else:
        # Ask the VLM whether last_action_summary navigated away from the previous page.
        redirection_res = chat([last_img_path, img_path], prompts.redirection_judge.format(action=last_action_summary))
        redirection_res = redirection_res.split('### Conclusion: ')[1].strip()
        assert redirection_res in ['Yes','No']
        if redirection_res == 'Yes':
            knowledge_item = create_new_item(img_path, knowledge_library, search_document, embedding_model)
            redirection_flag = True
        elif redirection_res == 'No':
            knowledge_item = knowledge_library[last_page_idx]
            redirection_flag = False
    # Record the raw screenshot filename on the node (strip the local HTTP prefix).
    knowledge_item['original_image'].append(img_path.split('http://localhost:6666/aitw_images/')[1])
    return knowledge_item, redirection_flag
# ---- Page-graph construction driver for AITW ----------------------------
# Samples 10% of the training episodes per subset and links consecutive
# screenshots into a page graph, one JSON library file per subset.
with open('aitw_annots/aitw_data_train.json', 'r') as f_in:  # 'with' fixes the leaked handle
    aitw_train_data = json.load(f_in)
aitw_data_type_list = ['install', 'googleapps', 'general', 'single', 'webshopping']
embedding_model_name = "bge-m3"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={'device': 'cuda:0'})
for aitw_data_type in aitw_data_type_list:
    knowledge_library = {}
    search_document = []
    # 10% sample keeps construction cost manageable.
    selected_episode = random.sample(aitw_train_data[aitw_data_type], len(aitw_train_data[aitw_data_type]) // 10)
    for episode in tqdm(selected_episode):
        last_page_idx = None
        last_img_path = None
        last_action_summary = None
        for i in range(len(episode)):
            img_path = 'http://localhost:6666/aitw_images/' + episode[i]['img_filename'] + '.png'
            if last_page_idx is not None:
                # Summarize the action that led from the previous screen to this one.
                action_description = action2description(episode[i-1])
                if action_description[:10] == '### Action':
                    last_action_summary = chat([last_img_path], prompts.action_summary.format(action_description=action_description))
                else:
                    last_action_summary = action_description
            knowledge_item, redirection_flag = get_item(img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model)
            if last_page_idx is not None:
                knowledge_library[last_page_idx]['next_page_list'][-1]['actions'].append(last_action_summary)
                knowledge_library[last_page_idx]['next_page_list'][-1]['goal'] = episode[i]['goal']
                if redirection_flag:
                    # Close the edge to the page we landed on and start a fresh one.
                    knowledge_library[last_page_idx]['next_page_list'][-1]['page_index'] = knowledge_item['index']
                    knowledge_library[last_page_idx]['next_page_list'].append({'actions': [], 'page_index': None})
            last_page_idx = knowledge_item['index']
            last_img_path = img_path
    # Context manager guarantees the library file is closed even if dumping fails.
    with open(f'{aitw_data_type}_library.json', 'w') as f_json:
        json.dump(knowledge_library, f_json, ensure_ascii=False, indent=4)
# Prompt templates for AITW page-graph construction.

# Describe the current screen in one sentence.
page_summary = 'Please describe this screen containing following content with one full sentence, including \
the type of page, the function of page and the key components of the screen.'
# Summarize a raw action (type + parameters) as a verb phrase.
action_summary = 'An operation has now been performed on the screen. \
Here is the type of the operation and relevant parameters:\n\
{action_description}\n\
You are required to summarize this operation with a verb phrase that begins with the given operation type.'
# Judge whether an action redirected to a new page (Yes) or stayed in-page (No).
redirection_judge = 'You will receive the images of screens before and after operation \'{action}\'. \
You need to determine whether this operation leads to a new page, or it is just an in-page operation. \
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
# Pick which stored page description (if any) matches the current screen.
check_repeat = 'You are a professional GUI agent. You will be given a screen and some descriptions. \
Your task is to find one description that best fits the current page.\n\
Here are the descriptions:\n\
{old_description}\
You should answer with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Index: <The index of chosen description, or \'None\' if none of them fits>\n\
Do not output anything else.'
# Visual double-check that two screenshots show the same page.
check_repeat_2 = 'Are these two screens similar? You should consider the type, layout, and content of the pages comprehensively.\n\
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
\ No newline at end of file
# from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# from qwen_vl_utils import process_vision_info
import random
# import cv2
import copy
import os
from pathlib import Path
from tqdm import tqdm
import requests
from urllib.parse import quote
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
import prompts
url = "http://localhost:8000/v1/chat/completions"
headers = {
    "Content-Type": "application/json"
}

def chat(img_url_list=(), query: str = '') -> str:
    """Send a multimodal chat request to the local vLLM OpenAI-compatible server.

    Args:
        img_url_list: iterable of image URLs to attach before the text query.
            (The original annotation said ``str``; the loop below requires an
            iterable of URL strings.)
        query: the text prompt.

    Returns:
        The assistant's reply text (``choices[0].message.content``), a str.
    """
    content = []
    for img_url in img_url_list:
        # Percent-encode everything except '/' and ':' so the URL survives JSON transport.
        img_url = quote(img_url, safe='/:')
        content.append({"type": "image_url", "image_url": {"url": img_url}})
    content.append({"type": "text", "text": query})
    data = {
        "model": "Qwen2.5-VL-72B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content}
        ],
        # Deterministic decoding so graph construction is reproducible.
        'temperature': 0
    }
    response = requests.post(url, headers=headers, data=json.dumps(data))
    response = response.json()
    return response['choices'][0]['message']['content']
def get_action_summary(img_path, action):
    """Turn one Mind2Web action record into a natural-language summary.

    Args:
        img_path: URL of the screenshot the action was taken on.
        action: Mind2Web action dict with 'operation' ({'op', 'value'}) and
            'bbox' ({'x', 'y', 'width', 'height'}) entries.

    Returns:
        A verb-phrase summary from the VLM, without a trailing period.

    Raises:
        AssertionError: if the operation type is not CLICK/TYPE/SELECT.
    """
    action_type = action['operation']['op']
    assert action_type in ['CLICK', 'TYPE', 'SELECT']
    # Convert x/y/width/height into an [x1, y1, x2, y2] pixel box.
    bbox = [int(action["bbox"]["x"]), int(action["bbox"]["y"]),
            int(action["bbox"]["x"] + action["bbox"]["width"]),
            int(action["bbox"]["y"] + action["bbox"]["height"])]
    bbox_str = f'[{bbox[0]}, {bbox[1]}, {bbox[2]}, {bbox[3]}]'
    if action_type == 'CLICK':
        query = prompts.click_action_summary.format(bbox=bbox_str)
    elif action_type == 'TYPE':
        query = prompts.type_action_summary.format(content=action['operation']['value'], bbox=bbox_str)
    elif action_type == 'SELECT':
        query = prompts.select_action_summary.format(content=action['operation']['value'], bbox=bbox_str)
    action_summary = chat([img_path], query)
    # endswith() guards the empty-reply case; action_summary[-1] raised IndexError.
    if action_summary.endswith('.'):
        action_summary = action_summary[:-1]
    return action_summary
def check_repeat_item(domain, img_path, page_summary, search_document, embedding_model):
    """Return (summary, index) of a previously-seen page under *domain* that
    matches the current screenshot, or (None, None) if the page is new."""
    docs = search_document[domain]
    if len(docs) == 0:
        return None, None
    # Retrieve candidate pages by summary similarity.
    store = FAISS.from_documents(docs, embedding_model)
    candidates = store.similarity_search(page_summary)
    numbered = ''.join(f'{pos + 1}. {doc.page_content}\n' for pos, doc in enumerate(candidates))
    reply = chat([img_path], prompts.check_repeat.format(old_description=numbered))
    chosen = reply.split('### Index: ')[1].strip()
    if chosen == 'None':
        return None, None
    match = candidates[int(chosen) - 1]
    # Visual double-check against the stored screenshot of the candidate.
    verdict = chat([match.metadata['img_path'], img_path], prompts.check_repeat_2)
    verdict = verdict.split('### Conclusion: ')[1].strip()
    assert verdict in ['Yes', 'No']
    if verdict == 'No':
        return None, None
    return match.page_content, match.metadata['index']
def create_new_item(domain, img_path, knowledge_library, search_document, embedding_model):
    """Summarize the screenshot and either register a new page node under
    *domain* or refresh the summary of the duplicate it matches."""
    page_summary = chat([img_path], prompts.page_summary)
    new_summary, repeat_index = check_repeat_item(
        domain, img_path, page_summary, search_document, embedding_model)
    if repeat_index is not None:
        # Duplicate page: keep the freshest summary in both stores.
        knowledge_item = knowledge_library[domain][repeat_index]
        knowledge_item['page_summary'] = new_summary
        search_document[domain][repeat_index].page_content = new_summary
        return knowledge_item
    # Unseen page: allocate the next index and seed an empty outgoing edge.
    idx = len(knowledge_library[domain])
    knowledge_item = {
        'index': idx,
        'page_summary': page_summary,
        'original_image': [],
        'next_page_list': [{'actions': [], 'page_index': None}],
    }
    knowledge_library[domain][idx] = knowledge_item
    search_document[domain].append(Document(
        page_content=page_summary,
        metadata={"index": idx, "img_path": img_path}))
    return knowledge_item
def get_item(domain, img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model):
    """Resolve the knowledge-library node under *domain* for the current screenshot.

    Returns:
        (knowledge_item, redirection_flag): the node for the current page and
        whether the previous action navigated to a different page.
    """
    if last_page_idx is None:
        # First step of an episode: no previous page, so treat it as a redirection.
        knowledge_item = create_new_item(domain, img_path, knowledge_library, search_document, embedding_model)
        redirection_flag = True
    else:
        # Ask the VLM whether the last action navigated away from the previous page.
        redirection_res = chat([last_img_path, img_path], prompts.redirection_judge.format(action=last_action_summary))
        redirection_res = redirection_res.split('### Conclusion: ')[1].strip()
        assert redirection_res in ['Yes','No']
        if redirection_res == 'Yes':
            knowledge_item = create_new_item(domain, img_path, knowledge_library, search_document, embedding_model)
            redirection_flag = True
        elif redirection_res == 'No':
            knowledge_item = knowledge_library[domain][last_page_idx]
            redirection_flag = False
    # Record the raw screenshot filename on the node (strip the local HTTP prefix).
    knowledge_item['original_image'].append(img_path.split('http://localhost:6667/mind2web_images/')[1])
    return knowledge_item, redirection_flag
# ---- Page-graph construction driver for Mind2Web ------------------------
# Samples 10% of the training episodes and links consecutive screenshots
# into per-domain page graphs, written to one JSON library file.
with open('mind2web_annots/mind2web_data_train.json', 'r') as f_in:  # 'with' fixes the leaked handle
    mind2web_train_data = json.load(f_in)
embedding_model_name = "bge-m3"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={'device': 'cuda:0'})
knowledge_library = {}
search_document = {}
selected_episode = random.sample(mind2web_train_data, len(mind2web_train_data) // 10)
for episode in tqdm(selected_episode):
    last_page_idx = None
    last_img_path = None
    last_action_summary = None
    domain = episode['domain']
    if domain not in knowledge_library:  # idiomatic membership test
        knowledge_library[domain] = {}
        search_document[domain] = []
    goal = episode['confirmed_task']
    episode_id = episode['annotation_id']
    action_list = episode['actions']
    terminate_flag = False
    for i in range(len(action_list)):
        img_path = 'http://localhost:6667/mind2web_images/' + episode_id + '-' + action_list[i]['action_uid'] + '.jpg'
        # Abandon episodes whose screenshots are missing on disk.
        if not os.path.exists('mind2web_images/' + episode_id + '-' + action_list[i]['action_uid'] + '.jpg'):
            terminate_flag = True
            print('IMAGE NOT FOUND')
            print(episode_id + '-' + action_list[i]['action_uid'])
            break
        if last_page_idx is not None:
            last_action_summary = get_action_summary(last_img_path, action_list[i-1])
        knowledge_item, redirection_flag = get_item(domain, img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model)
        if last_page_idx is not None:
            knowledge_library[domain][last_page_idx]['next_page_list'][-1]['actions'].append(last_action_summary)
            knowledge_library[domain][last_page_idx]['next_page_list'][-1]['goal'] = goal
            if redirection_flag:
                # Close the edge to the page we landed on and start a fresh one.
                knowledge_library[domain][last_page_idx]['next_page_list'][-1]['page_index'] = knowledge_item['index']
                knowledge_library[domain][last_page_idx]['next_page_list'].append({'actions': [], 'page_index': None})
        last_page_idx = knowledge_item['index']
        last_img_path = img_path
    if terminate_flag:
        continue
    # Record the episode's final action on the last visited page.
    # NOTE(review): len(action_list) > 1 skips single-action episodes — presumably
    # intentional, but confirm the final action should not be recorded there.
    if len(action_list) > 1:
        last_action_summary = get_action_summary(last_img_path, action_list[-1])
        knowledge_library[domain][last_page_idx]['next_page_list'][-1]['actions'].append(last_action_summary)
        knowledge_library[domain][last_page_idx]['next_page_list'][-1]['goal'] = goal
with open('mind2web_library.json', 'w') as f_json:
    json.dump(knowledge_library, f_json, ensure_ascii=False, indent=4)
\ No newline at end of file
# Prompt templates for Mind2Web page-graph construction.

# Describe the current webpage in one sentence.
page_summary = 'Please describe this screen containing following content with one full sentence, including \
the type of page, the function of page and the key components of the screen.'
# Summaries of click / type / select actions at a pixel bounding box.
click_action_summary = 'This is a page of website. The user clicks the item at coordinates {bbox}. You are required to summarize this operation beginning with \"click\". Do not mention original coordinates.'
type_action_summary = 'This is a page of website. The user types the content \"{content}\" at coordinates {bbox}. You are required to summarize this operation beginning with \"type\". Do not mention original coordinates.'
select_action_summary = 'This is a page of website. The user opens a \"Select Menu\" or \"Dropdown List\" at coordinates {bbox}, and select the option \"{content}\". You are required to summarize this operation beginning with \"select\". Do not mention original coordinates.'
# Judge whether an action redirected to a new page (Yes) or stayed in-page (No).
redirection_judge = 'You will receive the images of screens before and after operation \'{action}\'. \
You need to determine whether this operation leads to a new page, or it is just an in-page operation. \
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
# Pick which stored page description (if any) matches the current webpage.
check_repeat = 'You are a professional GUI agent. You will be given a webpage and some descriptions. \
Your task is to find one description that best fits the current webpage.\n\
Here are the descriptions:\n\
{old_description}\
You should answer with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Index: <The index of chosen description, or \'None\' if none of them fits>\n\
Do not output anything else.'
# Visual double-check that two screenshots show the same page.
check_repeat_2 = 'Are these two screens similar? You should consider the type, layout, and content of the pages comprehensively.\n\
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
# from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
# from qwen_vl_utils import process_vision_info
import random
# import cv2
import copy
import os
from pathlib import Path
from tqdm import tqdm
import requests
from urllib.parse import quote
import json
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings
from PIL import Image
import numpy as np
import prompts
url = "http://localhost:8000/v1/chat/completions"
headers = {
    "Content-Type": "application/json"
}

def chat(img_url_list=(), query: str = '') -> str:
    """Send a multimodal chat request to the local vLLM OpenAI-compatible server.

    Args:
        img_url_list: iterable of image URLs to attach before the text query.
            (The original annotation said ``str``; the loop below requires an
            iterable of URL strings.)
        query: the text prompt.

    Returns:
        The assistant's reply text (``choices[0].message.content``), a str.
    """
    content = []
    for img_url in img_url_list:
        # Percent-encode everything except '/' and ':' so the URL survives JSON transport.
        img_url = quote(img_url, safe='/:')
        content.append({"type": "image_url", "image_url": {"url": img_url}})
    content.append({"type": "text", "text": query})
    data = {
        "model": "Qwen2.5-VL-72B-Instruct",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": content}
        ],
        # Deterministic decoding so graph construction is reproducible.
        'temperature': 0
    }
    response = requests.post(url, headers=headers, data=json.dumps(data))
    response = response.json()
    return response['choices'][0]['message']['content']
def get_action_summary(img_path, step):
    """Turn one GUI-Odyssey step into a natural-language action summary.

    Args:
        img_path: URL of the screenshot the action was taken on.
        step: Odyssey step dict with 'action', 'info' and 'screenshot' keys.

    Returns:
        A short verb phrase describing the action.

    Raises:
        AssertionError: for unsupported action types.
        ValueError: for a CLICK/LONG_PRESS whose info is unrecognized.
    """
    action = step['action']
    info = step['info']
    assert action in ['CLICK', 'TEXT', 'SCROLL', 'LONG_PRESS']
    if action == 'CLICK' or action == "LONG_PRESS":
        if info == 'KEY_HOME':
            gt = 'press home to go to the home screen'
        elif info == 'KEY_BACK':
            gt = 'press back to go to the previous screen'
        elif info == 'KEY_APPSELECT':
            gt = 'go to the previous App'
        elif isinstance(info, list):  # isinstance instead of type() == list
            # Coordinates are normalized to 1000; scale to screenshot pixels.
            w, h = Image.open('GUI-Odyssey-master/data/screenshots/' + step['screenshot']).size
            bbox_str = f'[{int(info[0][0]/1000*w)}, {int(info[0][1]/1000*h)}]'
            query = prompts.click_action_summary.format(bbox=bbox_str)
            gt = chat([img_path], query)
            # endswith() guards the empty-reply case; gt[-1] raised IndexError.
            if gt.endswith('.'):
                gt = gt[:-1]
        else:
            raise ValueError(f'Unknown click action {info}')
    elif action == 'SCROLL':
        # Direction is the dominant axis of the swipe vector (end - start).
        start = np.array(info[0])
        end = np.array(info[1])
        delta = end - start
        delta_abs = np.abs(delta)
        lr = 'left' if delta[0] < 0 else 'right'
        ud = 'up' if delta[1] < 0 else 'down'
        if delta_abs[0] > delta_abs[1]:
            gt = f"scroll {lr}"
        else:
            gt = f"scroll {ud}"
    elif action == 'TEXT':
        gt = f'type {info}'
    return gt
def check_repeat_item(domain, img_path, page_summary, search_document, embedding_model):
    """Return (summary, index) of a previously-seen page under *domain* that
    matches the current screenshot, or (None, None) if the page is new."""
    docs = search_document[domain]
    if len(docs) == 0:
        return None, None
    # Retrieve candidate pages by summary similarity.
    store = FAISS.from_documents(docs, embedding_model)
    candidates = store.similarity_search(page_summary)
    numbered = ''.join(f'{pos + 1}. {doc.page_content}\n' for pos, doc in enumerate(candidates))
    reply = chat([img_path], prompts.check_repeat.format(old_description=numbered))
    chosen = reply.split('### Index: ')[1].strip()
    if chosen == 'None':
        return None, None
    match = candidates[int(chosen) - 1]
    # Visual double-check against the stored screenshot of the candidate.
    verdict = chat([match.metadata['img_path'], img_path], prompts.check_repeat_2)
    verdict = verdict.split('### Conclusion: ')[1].strip()
    assert verdict in ['Yes', 'No']
    if verdict == 'No':
        return None, None
    return match.page_content, match.metadata['index']
def create_new_item(domain, img_path, knowledge_library, search_document, embedding_model):
    """Summarize the screenshot and either register a new page node under
    *domain* or refresh the summary of the duplicate it matches."""
    page_summary = chat([img_path], prompts.page_summary)
    new_summary, repeat_index = check_repeat_item(
        domain, img_path, page_summary, search_document, embedding_model)
    if repeat_index is not None:
        # Duplicate page: keep the freshest summary in both stores.
        knowledge_item = knowledge_library[domain][repeat_index]
        knowledge_item['page_summary'] = new_summary
        search_document[domain][repeat_index].page_content = new_summary
        return knowledge_item
    # Unseen page: allocate the next index and seed an empty outgoing edge.
    idx = len(knowledge_library[domain])
    knowledge_item = {
        'index': idx,
        'page_summary': page_summary,
        'original_image': [],
        'next_page_list': [{'actions': [], 'page_index': None}],
    }
    knowledge_library[domain][idx] = knowledge_item
    search_document[domain].append(Document(
        page_content=page_summary,
        metadata={"index": idx, "img_path": img_path}))
    return knowledge_item
def get_item(domain, img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model):
    """Resolve the knowledge-library node under *domain* for the current screenshot.

    Returns:
        (knowledge_item, redirection_flag): the node for the current page and
        whether the previous action navigated to a different page.
    """
    if last_page_idx is None:
        # First step of an episode: no previous page, so treat it as a redirection.
        knowledge_item = create_new_item(domain, img_path, knowledge_library, search_document, embedding_model)
        redirection_flag = True
    else:
        # Ask the VLM whether the last action navigated away from the previous page.
        redirection_res = chat([last_img_path, img_path], prompts.redirection_judge.format(action=last_action_summary))
        redirection_res = redirection_res.split('### Conclusion: ')[1].strip()
        assert redirection_res in ['Yes','No']
        if redirection_res == 'Yes':
            knowledge_item = create_new_item(domain, img_path, knowledge_library, search_document, embedding_model)
            redirection_flag = True
        elif redirection_res == 'No':
            knowledge_item = knowledge_library[domain][last_page_idx]
            redirection_flag = False
    # Record the raw screenshot path on the node (strip the local HTTP prefix).
    knowledge_item['original_image'].append(img_path.split('http://localhost:6668/')[1])
    return knowledge_item, redirection_flag
# ---- Page-graph construction driver for GUI Odyssey ---------------------
# Samples 1/50 of the training episodes and links consecutive screenshots
# into per-category page graphs, written to one JSON library file.
with open('data/splits/splits_random_split.json', 'r') as f_in:  # 'with' fixes the leaked handle
    odyssey_data = json.load(f_in)
annotations_path = 'data/annotations/'
imgs_path = 'data/screenshots/'
embedding_model_name = "bge-m3"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name, model_kwargs={'device': 'cuda:0'})
knowledge_library = {}
search_document = {}
selected_episode_idx = random.sample(odyssey_data['train'], len(odyssey_data['train']) // 50)
for train_idx in tqdm(selected_episode_idx):
    with open(annotations_path + train_idx, 'r') as f_ep:
        episode = json.load(f_ep)
    last_page_idx = None
    last_img_path = None
    last_action_summary = None
    domain = episode['task_info']['category']
    if domain not in knowledge_library:  # idiomatic membership test
        knowledge_library[domain] = {}
        search_document[domain] = []
    goal = episode['task_info']['instruction']
    action_list = episode['steps']
    for i in range(len(action_list)):
        img_path = 'http://localhost:6668/' + action_list[i]['screenshot']
        if last_page_idx is not None:
            # Summarize the action that led from the previous screen to this one.
            last_action_summary = get_action_summary(last_img_path, action_list[i-1])
        knowledge_item, redirection_flag = get_item(domain, img_path, last_img_path, last_action_summary, last_page_idx, knowledge_library, search_document, embedding_model)
        if last_page_idx is not None:
            knowledge_library[domain][last_page_idx]['next_page_list'][-1]['actions'].append(last_action_summary)
            knowledge_library[domain][last_page_idx]['next_page_list'][-1]['goal'] = goal
            if redirection_flag:
                # Close the edge to the page we landed on and start a fresh one.
                knowledge_library[domain][last_page_idx]['next_page_list'][-1]['page_index'] = knowledge_item['index']
                knowledge_library[domain][last_page_idx]['next_page_list'].append({'actions': [], 'page_index': None})
        last_page_idx = knowledge_item['index']
        last_img_path = img_path
with open('odyssey_library.json', 'w') as f_json:
    json.dump(knowledge_library, f_json, ensure_ascii=False, indent=4)
# Prompt templates for GUI-Odyssey page-graph construction.

# Describe the current screen in one sentence.
page_summary = 'Please describe this screen containing following content with one full sentence, including \
the type of page, the function of page and the key components of the screen.'
# Summarize a click at a pixel coordinate as a verb phrase.
click_action_summary = 'The user clicks the item at coordinates {bbox}. You are required to summarize this operation with a verb phrase that begins with \"click\". Do not mention original coordinates.'
# Judge whether an action redirected to a new page (Yes) or stayed in-page (No).
redirection_judge = 'You will receive the images of screens before and after operation \'{action}\'. \
You need to determine whether this operation leads to a new page, or it is just an in-page operation. \
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
# Pick which stored page description (if any) matches the current screen.
check_repeat = 'You are a professional GUI agent. You will be given a screen and some descriptions. \
Your task is to find one description that best fits the current page.\n\
Here are the descriptions:\n\
{old_description}\
You should answer with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Index: <The index of chosen description, or \'None\' if none of them fits>\n\
Do not output anything else.'
# Visual double-check that two screenshots show the same page.
check_repeat_2 = 'Are these two screens similar? You should consider the type, layout, and content of the pages comprehensively.\n\
You are required to output with the following format:\n\
### Thought: <Generate your thinking process briefly>\n\
### Conclusion: <\'Yes\' or \'No\'>\n\
Do not output anything else.'
# Environment setup for page-graph construction and evaluation.
# Installs the retrieval stack (LangChain + FAISS + embeddings) …
pip install langchain
conda install -c pytorch faiss-gpu
pip install -U langchain-community
pip install sentence-transformers
# numpy is pinned for compatibility with the stack above — TODO confirm the pin is still required.
pip install numpy==1.23.2
pip install -U langchain-huggingface
pip install jax
pip install jaxlib
# … then vLLM, and serves Qwen2.5-VL-72B behind an OpenAI-compatible API
# (tensor-parallel over 4 GPUs, at most 2 images per prompt).
pip install --upgrade vllm
python -m vllm.entrypoints.openai.api_server --served-model-name Qwen2.5-VL-72B-Instruct --model Qwen2.5-VL-72B-Instruct -tp 4 --limit_mm_per_prompt image=2
This diff is collapsed.
This diff is collapsed.
AITW_ACTION_SPACE = '''
1. Click(x, y): An action of click a coordinate point on the smartphone screen and x,y is the position of the coordinate point on the screen.
Your click location should be a UI element or text on the screen.
A simple use case could be Click(100,238), which means you click the UI element at (100,238) on the current screen.
2. Type("typed_text"): An action of typing a piece of text.
A simple use case can be text("Hello, world!"), which inserts the string "Hello, world!" into the input area on the smartphone screen.
3. Scroll("direction"): This function is used to scroll the screen to a specific direction.
"direction" is a string that represents one of the four directions: "up", "down", "left", "right".
A simple use case could be Scroll("up"), which means you take a scroll up action on the current screen.
4. Back(): The action for returning to the previous step.
5. Home(): The action for returning to the homepage.
6. Enter(): The action of pressing the ENTER key to submit input content.
7. Complete: It means you think the task is complete.
'''
AITW_OBSERVATION_PROMT = f"""
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
You will be given user's ultimate purpose and the previous actions that you have taken.
Your task is to carefully observe the screen, descripe it and conclude some useful clues in one sentence.
Now you can start to observe:
### User's purpose ###
<goal>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Observation ###
"""
AITW_PLANNING_PROMT = f"""
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
Your task is to plan the next action to complete user's purpose with the help of references.
I will give you several important information:
### User's purpose ###
This is the user's global purpose, and your goal is to complete it:
<goal>
### Observation ###
This is the observation of the screen and some useful clues that help you plan:
<observation>
### Global Plan ###
This is the global plan for completing user's purpose:
<global_plan>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Reference ###
There are some reference actions that you can follow:
<reference>
Based on given information, you are required to output with following format:
1. <Please decide which sub-goal in the \"### Global Plan ###\" should be executed based on the screen image>
2. <Check if the user's global purpose has been completed. If the current screen state matches the user's global purpose, directly suggest that the task has been completed>
3. <If the global purpose is not completed: Inspired by \"### Reference ###\", you can list some actions than can possibly push the task progress or complete the goal>
"""
# Executor prompt for AITW: given the chosen plan and reference actions, pick
# exactly one concrete action from AITW_ACTION_SPACE for the current
# screenshot. Placeholders substituted at runtime: <action_plan>, <reference>.
# Fix vs. original: "weather" -> "whether"; redundant \" escapes dropped
# (string value unchanged).
AITW_EXECUTION_PROMT = f"""
You are a smart GUI agent, capable of comprehensively understanding the GUI interface.
You will be given a smartphone screenshot and a plan that you decide to take.
Before you start, I will explain the data format:
### Plan ###
This is your plan:
<action_plan>
### Action Space ###
These are the functions to interact with the phone:
{AITW_ACTION_SPACE}
### Reference ###
There are some reference actions that you can follow:
<reference>
Now please choose one action in "### Action Space ###" for the current screen state based on "### Plan ###" and "### Reference ###".
You should output with following format:
### Thought ###
According to "### Plan ###", you should first determine whether the purpose has been completed. If not, think step-by-step and output the action that should be taken currently.
### Action ###
The action you finally choose from "### Action Space ###". Do not output anything else.
"""
# One-shot global planning prompt for AITW: asks the model to split the task
# (<goal>, substituted at runtime) into 2~3 abstract, high-level sub-goals.
# Fix vs. original: dropped the pointless f-prefix (no interpolation) and
# redundant \" escapes (string value unchanged).
AITW_GLOBAL_PLANNING_PROMT = '''
You are an agent that is trained to complete certain tasks on a smartphone. You will be given a screenshot of a smartphone app.
The global task you should complete is:
"<goal>"
Now, carefully analyze all the above content and provide your output in the following format:
### Global Plan ###
Please break down the overall task into 2~3 simple sub-goals.
Note that since you can’t see future phone screenshots, each sub-goal should be abstract, high-level, and not involve interacting with specific UI elements.
'''
# Prompt asking the VLM for a one-sentence screenshot summary (page type,
# function, a few key components) when building page-node descriptions.
PAGE_SUMMARY_PROMPT = 'Please describe this screen containing following content with one full sentence: \
the type of page, the function of page and a few key components of the screen.'
# str.format template rendering one retrieved page-graph reference entry:
# {idx} is the rank, {actions} the reference actions, {goals} the goals they serve.
REFERENCE_FORMAT = '''{idx}.
You can take following action: {actions}.
This can help you achieve goals like: {goals}.
'''
# Prompt (filled via str.format with {coordinates}) asking the model to
# summarize a recorded click as a short verb phrase.
ACTION_SUMMARY_PROMPT = 'A click operation has now been performed at coordinates {coordinates}. \
You are required to summarize this operation with a verb phrase.'
This diff is collapsed.
This diff is collapsed.
# Action space description injected into the Mind2Web execution prompt:
# Click / Type / Select, each addressed by on-screen (x, y) coordinates.
# Fixes vs. original: "positon" -> "position", "cordinates" -> "coordinates";
# PEP 8 spacing around "="; redundant \" escapes dropped (value unchanged).
MIND2WEB_ACTION_SPACE = '''
1. Click(x,y): An action of clicking a coordinate point on the web screen and x,y is the position of the coordinate point on the screen.
Your click location should be a UI element or text on the screen.
A simple use case could be Click(100,238), which means you click the UI element at (100,238) on the current screen.
2. Type(x,y,"typed_text"): An action of typing a piece of text at the position with coordinates x and y.
A simple use case could be Type(340,212,"Where was Obama born?"), which inputs the string "Where was Obama born?" into the input area at the coordinates (340,212) on the web screen.
3. Select(x,y,"option"): An action of opening a "Select Menu" or "Dropdown List" located at coordinates (x, y) and choose an option you specify.
A simple use case could be Select(679,437,"female"), which opens the list at the coordinates (679,437) and select the option "female" from the list.
'''
# Observer prompt for Mind2Web: summarize the current webpage and useful clues
# in one sentence, given the goal (<goal>) and action history (<history>),
# both substituted at runtime.
# Fixes vs. original: "descripe" -> "describe"; dropped the pointless f-prefix
# (no interpolation).
MIND2WEB_OBSERVATION_PROMT = """
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
You will be given user's ultimate purpose and the previous actions that you have taken.
Your task is to carefully observe the screen, describe it and conclude some useful clues in one sentence.
Now you can start to observe:
### User's purpose ###
<goal>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Observation ###
"""
# One-shot global planning prompt for Mind2Web: asks the model to split the
# task (<goal>, substituted at runtime) into 2~3 abstract, high-level sub-goals.
# Fix vs. original: dropped the pointless f-prefix (no interpolation) and
# redundant \" escapes (string value unchanged).
MIND2WEB_GLOBAL_PLANNING_PROMT = '''
You are an agent that is trained to complete certain tasks on the webpage. You will be given a screenshot of a website.
The global task you should complete is:
"<goal>"
Now, carefully analyze all the above content and provide your output in the following format:
### Global Plan ###
Please break down the overall task into 2~3 simple sub-goals.
Note that since you can’t see future webpages, each sub-goal should be abstract, high-level, and not involve interacting with specific UI elements.
'''
# Planner prompt for Mind2Web: given the goal, observation, global plan,
# history and references, pick the active sub-goal and list candidate actions.
# Placeholders substituted at runtime: <goal>, <observation>, <global_plan>,
# <history>, <reference>.
# Fixes vs. original: "than can" -> "that can"; dropped the pointless f-prefix
# (no interpolation) and redundant \" escapes (string value unchanged).
MIND2WEB_PLANNING_PROMT = """
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
Your task is to plan the next action to complete user's purpose with the help of references.
I will give you several important information:
### User's purpose ###
This is the user's global purpose, and your goal is to complete it:
<goal>
### Observation ###
This is the observation of the screen and some useful clues that help you plan:
<observation>
### Global Plan ###
This is the global plan for completing user's purpose:
<global_plan>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Reference ###
There are some reference actions that you can follow:
<reference>
Based on given information, you are required to output with following format:
1. <Please decide which sub-goal in the "### Global Plan ###" should be executed based on the screen image>
2. <Inspired by "### Reference ###", you can list some actions that can possibly push the task progress or complete the goal>
"""
# Executor prompt for Mind2Web: choose exactly one action from
# MIND2WEB_ACTION_SPACE based on the plan (<action_plan>) and retrieved
# references (<reference>), both substituted at runtime.
MIND2WEB_EXECUTION_PROMT = f"""
You are a smart GUI agent, capable of comprehensively understanding the GUI interface.
You will be given a screenshot of a website and a plan that you decide to take.
Before you start, I will explain the data format:
### Plan ###
This is your plan:
<action_plan>
### Reference ###
There are some reference actions that you can follow:
<reference>
### Action Space ###
These are the functions to interact with the webpage:
{MIND2WEB_ACTION_SPACE}
Now please choose one action in \"### Action Space ###\" for the current webpage based on \"### Plan ###\" and \"### Reference ###\".
You should output with following format:
### Thought ###
Think step-by-step and output the action that should be taken currently.
### Action ###
Output only one action you finally choose from \"### Action Space ###\". Do not output anything else.
"""
# Prompts (filled via str.format) asking the model to summarize a recorded web
# action as a short phrase; the key selects the action type.
# {bbox}: element coordinates; {content}: typed text or selected option.
ACTION_SUMMARY_PROMPT = {
'click_action_summary' : 'This is a page of website. The user clicks the item at coordinates {bbox}. You are required to summarize this operation beginning with \"click\". Do not mention original coordinates.',
'type_action_summary' : 'This is a page of website. The user types the content \"{content}\" at coordinates {bbox}. You are required to summarize this operation beginning with \"type\". Do not mention original coordinates.',
'select_action_summary' : 'This is a page of website. The user opens a \"Select Menu\" or \"Dropdown List\" at coordinates {bbox}, and select the option \"{content}\". You are required to summarize this operation beginning with \"select\". Do not mention original coordinates.'
}
# Prompt asking the VLM for a one-sentence page summary (page type, function,
# key components) when building page-node descriptions.
PAGE_SUMMARY_PROMPT = 'Please describe this screen containing following content with one full sentence, including \
the type of page, the function of page and the key components of the screen.'
# str.format template rendering one retrieved page-graph reference entry:
# {idx} is the rank, {actions} the reference actions, {goals} the goals they serve.
REFERENCE_FORMAT = '''{idx}.
You can take following action: {actions}.
This can help you achieve goals like: {goals}.
'''
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
# Action space description injected into the GUI Odyssey execution prompt:
# click, type, scroll, navigation buttons, task-termination signals, long press.
# Fixes vs. original: "an UI element" -> "a UI element" (matches the wording
# used for CLICK above); "where x and y is" -> "where x and y are".
ODYSSEY_ACTION_SPACE = '''
1. 'CLICK: (x,y)': An action of clicking a coordinate point on the smartphone screen and x,y is the position of the coordinate point on the screen.
Your click location should be a UI element or text on the screen.
A simple use case could be 'CLICK: (100,238)', which means you click the UI element at (100,238) on the current screen.
2. 'TYPE: typed_text': An action of typing a piece of text.
A simple use case can be 'TYPE: Hello, world!', which inserts the string "Hello, world!" into the input area on the smartphone screen.
3. 'SCROLL: direction': This function is used to scroll a UI element shown on the smartphone screen, usually a scroll view or a slide bar.
"direction" is a string that represents one of the four directions: UP, DOWN, LEFT, RIGHT.
A simple use case could be 'SCROLL: UP', which means you take a scroll up action on the current screen.
4. 'PRESS_BACK': The action for returning to the previous screen.
5. 'PRESS_HOME': The action for returning to the homepage.
6. 'PRESS_RECENT': The action to go to the previous App.
7. 'COMPLETE': It means you think the task has been completed based on current screen.
8. 'IMPOSSIBLE': It means you think the task cannot be completed based on current screen.
9. 'LONG_PRESS: (x,y)': An action of pressing a coordinate point on the smartphone screen for a long time to copy texts or download images, where x and y are the position of the coordinate point on the screen.
'''
# Observer prompt for GUI Odyssey: summarize the current screen and useful
# clues in one sentence, given the goal (<goal>) and action history
# (<history>), both substituted at runtime.
# Fixes vs. original: "descripe" -> "describe"; dropped the pointless f-prefix
# (no interpolation).
ODYSSEY_OBSERVATION_PROMT = """
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
You will be given user's ultimate purpose and the previous actions that you have taken.
Your task is to carefully observe the screen, describe it and conclude some useful clues in one sentence.
Now you can start to observe:
### User's purpose ###
<goal>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Observation ###
"""
# One-shot global planning prompt for GUI Odyssey: asks the model to split the
# task (<goal>, substituted at runtime) into 2~3 abstract, high-level sub-goals.
# Fix vs. original: dropped the pointless f-prefix (no interpolation) and
# redundant \" escapes (string value unchanged).
ODYSSEY_GLOBAL_PLANNING_PROMT = '''
You are an agent that is trained to complete certain tasks on a smartphone. You will be given a screenshot of a smartphone app.
The global task you should complete is:
"<goal>"
Now, carefully analyze all the above content and provide your output in the following format:
### Global Plan ###
Please break down the overall task into 2~3 simple sub-goals.
Note that since you can’t see future phone screenshots, each sub-goal should be abstract, high-level, and not involve interacting with specific UI elements.
'''
# Planner prompt for GUI Odyssey: given the goal, observation, global plan,
# history and references, decide the active sub-goal, check completion, and
# list candidate actions. Placeholders substituted at runtime: <goal>,
# <observation>, <global_plan>, <history>, <reference>.
# Fixes vs. original: "than can" -> "that can"; dropped the pointless f-prefix
# (no interpolation) and redundant \" escapes (string value unchanged).
ODYSSEY_PLANNING_PROMT = """
You are a smart GUI agent, capable of comprehensively understanding the GUI interface as well as the user's intentions.
Your task is to plan the next action to complete user's purpose with the help of references.
I will give you several important information:
### User's purpose ###
This is the user's global purpose, and your goal is to complete it:
<goal>
### Observation ###
This is the observation of the screen and some useful clues that help you plan:
<observation>
### Global Plan ###
This is the global plan for completing user's purpose:
<global_plan>
### History trajectory ###
History trajectory can remind you of the operations that have been executed before, thus avoiding repetitive actions.
<history>
### Reference ###
There are some reference actions that you can follow:
<reference>
Based on given information, you are required to output with following format:
1. <Please decide which sub-goal in the "### Global Plan ###" should be executed based on the screen image>
2. <Check if the user's global purpose has been completed. If the current screen state matches the user's global purpose, directly suggest that the task has been completed>
3. <If the global purpose is not completed: Inspired by "### Reference ###", you can list some actions that can possibly push the task progress or complete the goal>
"""
# Executor prompt for GUI Odyssey: given the chosen plan and reference
# actions, pick exactly one concrete action from ODYSSEY_ACTION_SPACE for the
# current screenshot. Placeholders substituted at runtime: <action_plan>,
# <reference>.
# Fix vs. original: "weather" -> "whether"; redundant \" escapes dropped
# (string value unchanged).
ODYSSEY_EXECUTION_PROMT = f"""
You are a smart GUI agent, capable of comprehensively understanding the GUI interface.
You will be given a smartphone screenshot and a plan that you decide to take.
Before you start, I will explain the data format:
### Plan ###
This is your plan:
<action_plan>
### Action Space ###
These are the functions to interact with the phone:
{ODYSSEY_ACTION_SPACE}
### Reference ###
There are some reference actions that you can follow:
<reference>
Now please choose one action in "### Action Space ###" for the current screen state based on "### Plan ###" and "### Reference ###".
You should output with following format:
### Thought ###
According to "### Plan ###", you should first determine whether the purpose has been completed. If not, think step-by-step and output the action that should be taken currently.
### Action ###
The action you finally choose from "### Action Space ###". Do not output anything else.
"""
# str.format template rendering one retrieved page-graph reference entry:
# {idx} is the rank, {actions} the reference actions, {goals} the goals they serve.
REFERENCE_FORMAT = '''{idx}.
You can take following action: {actions}.
This can help you achieve goals like: {goals}.
'''
# Prompt asking the VLM for a one-sentence page summary (page type, function,
# key components) when building page-node descriptions.
PAGE_SUMMARY_PROMPT = 'Please describe this screen containing following content with one full sentence, including \
the type of page, the function of page and the key components of the screen.'
# Prompt (filled via str.format with {bbox}) asking the model to summarize a
# recorded click as a verb phrase starting with "click".
ACTION_SUMMARY_PROMPT = 'The user clicks the item at coordinates {bbox}. You are required to summarize this operation with a verb phrase that begins with \"click\". Do not mention original coordinates.'
# Environment setup for the PG-Agent workflow.
# Retrieval / embedding dependencies: LangChain community integrations,
# HuggingFace embeddings, JAX, FAISS vector store, sentence-transformers.
pip install langchain_community
pip install langchain_huggingface
pip install jax
pip install jaxlib
pip install faiss-gpu
pip install sentence-transformers
# Serve Qwen2.5-VL-72B-Instruct through vLLM's OpenAI-compatible API with
# 4-way tensor parallelism, allowing up to 2 images per prompt.
python -m vllm.entrypoints.openai.api_server --served-model-name Qwen2.5-VL-72B-Instruct --model Qwen2.5-VL-72B-Instruct -tp 4 --limit_mm_per_prompt image=2
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment