|
13 | 13 | from rich.console import Console |
14 | 14 | from rich.text import Text |
15 | 15 |
|
| 16 | +from open_instruct.utils import GCP_CLUSTERS, INTERCONNECT_CLUSTERS, WEKA_CLUSTERS |
| 17 | + |
16 | 18 | console = Console() |
17 | 19 |
|
18 | 20 |
|
@@ -87,11 +89,6 @@ def parse_env_var(env_var_str: str) -> dict[str, str]: |
87 | 89 | return {"name": name, "value": value} |
88 | 90 |
|
89 | 91 |
|
90 | | -WEKA_CLUSTERS = ["ai2/jupiter", "ai2/saturn", "ai2/titan", "ai2/neptune", "ai2/ceres", "ai2/triton", "ai2/rhea"] |
91 | | -GCP_CLUSTERS = ["ai2/augusta"] |
92 | | - |
93 | | -INTERCONNECT_CLUSTERS = ["ai2/jupiter", "ai2/ceres", "ai2/titan", "ai2/augusta"] |
94 | | - |
95 | 92 | # by default, we turn off vllm compile cache |
96 | 93 | # torch compile caching seems consistently broken, but the actual compiling isn't. |
97 | 94 | # Not sure why, for now we have disabled the caching (VLLM_DISABLE_COMPILE_CACHE=1). |
@@ -589,24 +586,29 @@ def make_internal_command(command: list[str], args: argparse.Namespace, whoami: |
589 | 586 | model_revision = command[idx + 1] |
590 | 587 | break |
591 | 588 |
|
592 | | - commit_hash = get_commit_hash(model_name_or_path, model_revision, "config.json", "model") |
593 | | - if os.path.exists(model_name_or_path): |
594 | | - path = model_name_or_path |
595 | | - assert args.gs_model_name is not None, "for local models to upload to gs, you must set --gs_model_name" |
596 | | - model_name_or_path = args.gs_model_name |
597 | | - commit_hash = hashlib.md5(model_name_or_path.encode("utf-8")).hexdigest()[:8] |
598 | | - console.log( |
599 | | - f"Local model is already downloaded, using gs_model_name {model_name_or_path}, with hash of model path {commit_hash}" |
600 | | - ) |
| 589 | + if model_name_or_path.startswith("gs://"): |
| 590 | + gs_saved_path = model_name_or_path |
601 | 591 | else: |
602 | | - download_from_hf(model_name_or_path, model_revision) # first download the model |
603 | | - path = download_from_hf(model_name_or_path, model_revision) # then get the path |
604 | | - gs_saved_path = f"gs://ai2-llm/post-training/deletable_cache_models/{model_name_or_path}/{commit_hash}" |
605 | | - gs_folder = gs_folder_exists( |
606 | | - gs_saved_path |
607 | | - ) # race condition exists, but it's fine since we are launching mason sequentially |
608 | | - if not gs_folder: |
609 | | - upload_to_gs_bucket(path, gs_saved_path) |
| 592 | + commit_hash = get_commit_hash(model_name_or_path, model_revision, "config.json", "model") |
| 593 | + if os.path.exists(model_name_or_path): |
| 594 | + path = model_name_or_path |
| 595 | + assert args.gs_model_name is not None, ( |
| 596 | + "for local models to upload to gs, you must set --gs_model_name" |
| 597 | + ) |
| 598 | + model_name_or_path = args.gs_model_name |
| 599 | + commit_hash = hashlib.md5(model_name_or_path.encode("utf-8")).hexdigest()[:8] |
| 600 | + console.log( |
| 601 | + f"Local model is already downloaded, using gs_model_name {model_name_or_path}, with hash of model path {commit_hash}" |
| 602 | + ) |
| 603 | + else: |
| 604 | + download_from_hf(model_name_or_path, model_revision) # first download the model |
| 605 | + path = download_from_hf(model_name_or_path, model_revision) # then get the path |
| 606 | + gs_saved_path = f"gs://ai2-llm/post-training/deletable_cache_models/{model_name_or_path}/{commit_hash}" |
| 607 | + gs_folder = gs_folder_exists( |
| 608 | + gs_saved_path |
| 609 | + ) # race condition exists, but it's fine since we are launching mason sequentially |
| 610 | + if not gs_folder: |
| 611 | + upload_to_gs_bucket(path, gs_saved_path) |
610 | 612 |
|
611 | 613 | download_path = gs_saved_path.replace("gs://", "/gs/") |
612 | 614 | download_path_without_last_folder = download_path.rsplit("/", 1)[0] |
|
0 commit comments