Merged
6 changes: 6 additions & 0 deletions tests/models/phi/test_modeling_phi.py
@@ -288,6 +288,12 @@ class PhiModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
    test_headmasking = False
    test_pruning = False

    # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79292/workflows/fa2ba644-8953-44a6-8f67-ccd69ca6a476/jobs/1012905
    def is_pipeline_test_to_skip(
        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
    ):
        return True
Comment on lines +291 to +295
Collaborator Author
missed in the hotfix #27687


    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.setUp with Llama->Phi
    def setUp(self):
        self.model_tester = PhiModelTester(self)
126 changes: 121 additions & 5 deletions utils/tests_fetcher.py
@@ -51,9 +51,11 @@

import argparse
import collections
import importlib.util
import json
import os
import re
import tempfile
from contextlib import contextmanager
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union
@@ -254,6 +256,122 @@ def diff_contains_doc_examples(repo: Repo, branching_point: str, filename: str)
    return old_content_clean != new_content_clean


def get_impacted_files_from_tiny_model_summary(diff_with_last_commit: bool = False) -> List[str]:
    """
    Return a list of python modeling files that are impacted by the changes of `tiny_model_summary.json` in between:

    - the current head and the main branch if `diff_with_last_commit=False` (default)
    - the current head and its parent commit otherwise.

    Returns:
        `List[str]`: The list of Python modeling files that are impacted by the changes of `tiny_model_summary.json`.
    """
    repo = Repo(PATH_TO_REPO)

    folder = Path(repo.working_dir)

    if not diff_with_last_commit:
        print(f"main is at {repo.refs.main.commit}")
        print(f"Current head is at {repo.head.commit}")

        commits = repo.merge_base(repo.refs.main, repo.head)
        for commit in commits:
            print(f"Branching commit: {commit}")
    else:
        print(f"main is at {repo.head.commit}")
        commits = repo.head.commit.parents
        for commit in commits:
            print(f"Parent commit: {commit}")

    if not os.path.isfile(folder / "tests/utils/tiny_model_summary.json"):
        return []

    files = set()
    for commit in commits:
        with checkout_commit(repo, commit):
            with open(folder / "tests/utils/tiny_model_summary.json", "r", encoding="utf-8") as f:
                old_content = f.read()

        with open(folder / "tests/utils/tiny_model_summary.json", "r", encoding="utf-8") as f:
            new_content = f.read()

        # get the content as json object
        old_content = json.loads(old_content)
        new_content = json.loads(new_content)

        old_keys = set(old_content.keys())
        new_keys = set(new_content.keys())
Comment on lines +302 to +303
Collaborator
keys are always a set

Collaborator Author
No, 'dict_keys' is not the same as a set: a 'dict_keys' object has no attribute 'intersection'.

a = {1: 2, 2: 3}
b = {2: 3, 3: 4}

a.keys()
b.keys()

c1 = set(a.keys()).intersection(set(b.keys()))
print(c1)


c2 = a.keys().intersection(b.keys())
print(c2)

gives

{2}
Traceback (most recent call last):
  File "C:\Users\33611\Desktop\Project\transformers\temp3.py", line 11, in <module>
    c2 = a.keys().intersection(b.keys())
AttributeError: 'dict_keys' object has no attribute 'intersection'

Process finished with exit code 1

Collaborator
Alright! Thanks
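For reference, dict view objects do support the set operators (&, |, ^, -) directly, even though they lack the named methods such as intersection(); a minimal sketch, independent of the PR code:

a = {1: 2, 2: 3}
b = {2: 3, 3: 4}

# Operators work on dict views and return plain sets:
print(a.keys() & b.keys())  # {2}
print(a.keys() ^ b.keys())  # {1, 3}

# The named set methods remain unavailable on dict_keys:
# a.keys().intersection(b.keys())  -> AttributeError

Wrapping the keys in set(), as the PR does, keeps the later symmetric_difference and intersection calls available as named methods.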


        # get the difference
        keys_with_diff = old_keys.symmetric_difference(new_keys)
        common_keys = old_keys.intersection(new_keys)
        # if both have the same key, check its content
        for key in common_keys:
            if old_content[key] != new_content[key]:
                keys_with_diff.add(key)

        # get the model classes
        impacted_model_classes = []
        for key in keys_with_diff:
            if key in new_keys:
                impacted_model_classes.extend(new_content[key]["model_classes"])

        # get the module where the model classes are defined. We want to use the main `__init__` file, but it requires
        # all the framework being installed, which is not ideal for a simple script like test fetcher.
        # So we create a temporary and modified main `__init__` and access its `_import_structure`.
Comment on lines +319 to +321
Collaborator
Why not use inspect on the class? The transformers package should be installed, and

>>> import inspect
>>> from transformers import *
>>> inspect.getfile(globals()["CLIPConfig"])
 '/Users/arthurzucker/Work/transformers/src/transformers/models/clip/configuration_clip.py'

we can split on src and be done with it?
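A minimal sketch of what this inspect-based alternative could look like; the helper name is invented here, and it assumes an editable install where the package lives under src/ (this is not what the PR ends up implementing):

import inspect
import os

import transformers  # needs torch/TF/Flax installed, otherwise dummy objects are returned


def modeling_file_for(class_name: str) -> str:
    # Hypothetical helper: resolve the class via the top-level package, then locate its source file.
    cls = getattr(transformers, class_name)
    path = inspect.getfile(cls)
    # Keep everything from src/ onwards, as suggested above.
    marker = f"{os.path.sep}src{os.path.sep}"
    return "src" + os.path.sep + path.split(marker, 1)[1]


# e.g. modeling_file_for("CLIPModel") -> "src/transformers/models/clip/modeling_clip.py"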

Collaborator Author
Using inspect would make this test fetcher require all the dependencies to be installed, otherwise one gets a dummy object; the process would also become slower due to all the importing.

Collaborator
Mmm, but it's more resilient and less error-prone. Agreed though that we would need every TF, Flax and torch dependency, which is not optimal.

Collaborator Author
@ydshieh ydshieh Nov 28, 2023
more resilient and less error-prone

I agree too.

I don't think the community would run the test fetcher, so I am fine if we decide to use inspect (on the CI). Two things to consider:

  • On CircleCI: the fetch test job might take longer (bigger cache to download, longer installation if cache miss)
  • On Push CI: the results would look a bit strange/confusing, as Flax is not installed in our CI docker image.

Collaborator
@ArthurZucker ArthurZucker Nov 28, 2023
If I understood correctly, we need the dependencies anyway on the machine that will run the tests, no? So what we can do is:

  • load the object
  • if it's a dummy, log that dummy object and don't use it (will help you with debugging). That means we don't have the deps on the machine
  • else, we have the deps, all good

Collaborator Author
If you confirm that it's ok to install transformers[dev] in the CircleCI job fetch_tests in .circleci/config.yml, then I am OK with the suggestions :-)

Collaborator Author
Currently, the test fetcher runs in ~1m30.

Loading the cache of the whole .[dev] will take 5 minutes, plus a bit more for all the importing.

Collaborator
Fetching the tests should be fast, so I would just do this job last. I thought we had a different machine running the pipeline tests and would say let's forward getting the files to update over there if there are any changes to the tiny model!
Cache is so slow :(

Collaborator Author
forward getting the files to update over there

If the commit(s) in a PR only modify tests/utils/tiny_model_summary.json, then fetch_tests in .circleci/config.yml will produce an empty job. So there is no job (like torch_job / tf_job) to perform getting the files to update, and we will miss the desired pipeline tests.

Collaborator Author
Thanks for the offline discussion. Yes, your suggestion is a good possibility (other than the fact that .circleci/create_circleci_config.py might get a bit more complex in order to determine the actual tests to run dynamically).

As mentioned, I propose merging the current version, and I will work on a follow-up PR to implement your suggested idea. That way, if there is any unexpected issue, we at least have a working version.

        with open(folder / "src/transformers/__init__.py") as fp:
            lines = fp.readlines()
            new_lines = []
            # Get all the code related to `_import_structure`
            for line in lines:
                if line == "_import_structure = {\n":
                    new_lines.append(line)
                elif line == "# Direct imports for type-checking\n":
                    break
                elif len(new_lines) > 0:
                    # bypass the framework check so we can get all the information even if frameworks are not available
                    line = re.sub(r"is_.+_available\(\)", "True", line)
                    line = line.replace("OptionalDependencyNotAvailable", "Exception")
                    line = line.replace("Exception()", "Exception")
                    new_lines.append(line)

        # create and load the temporary module
        with tempfile.TemporaryDirectory() as tmpdirname:
            with open(os.path.join(tmpdirname, "temp_init.py"), "w") as fp:
                fp.write("".join(new_lines))

            spec = importlib.util.spec_from_file_location("temp_init", os.path.join(tmpdirname, "temp_init.py"))
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            # Finally, get `_import_structure` that we need
            import_structure = module._import_structure

            # map model classes to their defined module
            reversed_structure = {}
            for key, values in import_structure.items():
                for value in values:
                    reversed_structure[value] = key

            # Get the corresponding modeling file path
            for model_class in impacted_model_classes:
                module = reversed_structure[model_class]
                framework = ""
                if model_class.startswith("TF"):
                    framework = "tf"
                elif model_class.startswith("Flax"):
                    framework = "flax"
                fn = (
                    f"modeling_{module.split('.')[-1]}.py"
                    if framework == ""
                    else f"modeling_{framework}_{module.split('.')[-1]}.py"
                )
                files.add(
                    f"src.transformers.{module}.{fn}".replace(".", os.path.sep).replace(f"{os.path.sep}py", ".py")
                )

    return sorted(files)
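For illustration, a hypothetical way to exercise the new helper from the repository root; the sys.path tweak and the example output are assumptions, not part of the PR:

# Run from the repository root on a branch where tests/utils/tiny_model_summary.json was modified.
import sys

sys.path.insert(0, "utils")

from tests_fetcher import get_impacted_files_from_tiny_model_summary

# Compare the current head against the merge base with main (the default behaviour).
impacted = get_impacted_files_from_tiny_model_summary(diff_with_last_commit=False)
print(impacted)
# e.g. ['src/transformers/models/phi/modeling_phi.py']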


def get_diff(repo: Repo, base_commit: str, commits: List[str]) -> List[str]:
    """
    Get the diff between a base commit and one or several commits.
@@ -949,18 +1067,16 @@ def infer_tests_to_run(
    if any(x in modified_files for x in ["setup.py", ".circleci/create_circleci_config.py"]):
        test_files_to_run = ["tests", "examples"]
        repo_utils_launch = True
-    # in order to trigger pipeline tests even if no code change at all
-    elif "tests/utils/tiny_model_summary.json" in modified_files:
-        test_files_to_run = ["tests"]
-        repo_utils_launch = any(f.split(os.path.sep)[0] == "utils" for f in modified_files)
Comment on lines -952 to -955
Collaborator Author
This was not working: modified_files is computed by get_modified_python_files(diff_with_last_commit=diff_with_last_commit), so tiny_model_summary.json is never in that list.

My bad, sorry

    else:
        # All modified tests need to be run.
        test_files_to_run = [
            f for f in modified_files if f.startswith("tests") and f.split(os.path.sep)[-1].startswith("test")
        ]
+        impacted_files = get_impacted_files_from_tiny_model_summary(diff_with_last_commit=diff_with_last_commit)

        # Then we grab the corresponding test files.
        test_map = create_module_to_test_map(reverse_map=reverse_map, filter_models=filter_models)
-        for f in modified_files:
+        for f in modified_files + impacted_files:
            if f in test_map:
                test_files_to_run.extend(test_map[f])
        test_files_to_run = sorted(set(test_files_to_run))
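To make the wiring above concrete, a small illustrative sketch of how an impacted modeling file picks up its tests through the module-to-test map; the map contents and file lists here are invented for the example, not read from the repository:

# Illustrative only: the real map is built by create_module_to_test_map().
test_map = {
    "src/transformers/models/phi/modeling_phi.py": ["tests/models/phi/test_modeling_phi.py"],
}

modified_files = []  # e.g. a PR that only touches tests/utils/tiny_model_summary.json
impacted_files = ["src/transformers/models/phi/modeling_phi.py"]  # as returned by the new helper

test_files_to_run = []
for f in modified_files + impacted_files:
    if f in test_map:
        test_files_to_run.extend(test_map[f])

print(sorted(set(test_files_to_run)))
# ['tests/models/phi/test_modeling_phi.py']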