2 changes: 2 additions & 0 deletions FlagEmbedding/abc/inference/AbsEmbedder.py
@@ -416,6 +416,8 @@ def _concatenate_results_from_multi_process(self, results_list: List[Union[torch
             Union[torch.Tensor, np.ndarray]: return the embedding vectors in a numpy array or tensor.
         """
         if isinstance(results_list[0], torch.Tensor):
+            # move all tensors to the same device
+            results_list = [res.to(self.target_devices[0]) for res in results_list]
             return torch.cat(results_list, dim=0)
         elif isinstance(results_list[0], np.ndarray):
             return np.concatenate(results_list, axis=0)
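The two added lines matter because `torch.cat` requires all of its inputs to live on the same device, and results gathered from multiple worker processes may sit on different GPUs. A minimal sketch of the failure mode and the fix (it assumes a machine with two CUDA devices; the shapes are illustrative):

```python
import torch

# embeddings returned by two worker processes, each on its own GPU
results = [torch.randn(4, 8, device="cuda:0"), torch.randn(4, 8, device="cuda:1")]

# torch.cat(results, dim=0)  # would raise RuntimeError: tensors on different devices

# moving everything to one target device first makes the concatenation valid
target = "cuda:0"
merged = torch.cat([r.to(target) for r in results], dim=0)
print(merged.shape)  # torch.Size([8, 8])
```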
16 changes: 11 additions & 5 deletions FlagEmbedding/finetune/embedder/decoder_only/base/load_model.py
@@ -51,13 +51,15 @@ def get_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir: str, re
         config = AutoConfig.from_pretrained(
             model_args.config_name,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         raise ValueError(
@@ -74,6 +76,7 @@ def get_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir: str, re
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         logger.info("Training new model from scratch")
@@ -129,13 +132,15 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:
         config = AutoConfig.from_pretrained(
             model_args.config_name,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         raise ValueError(
@@ -152,6 +157,7 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         model = model_args.from_config(config)
@@ -173,5 +179,5 @@ def save_merged_model(model_args: DecoderOnlyEmbedderModelArguments, output_dir:

     model.save_pretrained(os.path.join(output_dir, 'merged_model'))

-    tokenizer = AutoTokenizer.from_pretrained(output_dir)
+    tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=model_args.trust_remote_code)
     tokenizer.save_pretrained(os.path.join(output_dir, 'merged_model'))
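All of these `trust_remote_code=...` additions serve one purpose: Hugging Face `transformers` only executes a model's custom modeling code from the Hub when this flag is set, so models that ship their own architecture code cannot be loaded without it. The pattern being threaded through, shown in isolation (the repo name is a hypothetical placeholder):

```python
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# hypothetical Hub repo that ships its own modeling code
model_name = "some-org/model-with-custom-code"

# opting in executes Python code downloaded from the Hub,
# so only enable it for repositories you trust
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
```

Passing the flag explicitly at every call site keeps the behavior opt-in and auditable instead of relying on an environment default.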
3 changes: 2 additions & 1 deletion FlagEmbedding/finetune/embedder/decoder_only/base/runner.py
@@ -41,7 +41,8 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
             token=self.model_args.token,
             cache_dir=self.model_args.cache_dir,
             use_fast=False,
-            add_eos_token=True
+            add_eos_token=True,
+            trust_remote_code=self.model_args.trust_remote_code,
         )

         if tokenizer.pad_token is None:
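The hunk ends at the `pad_token` check because decoder-only tokenizers often ship without a padding token. The usual fallback, shown here as a general sketch rather than the repo's exact code, is to reuse the EOS token:

```python
# decoder-only tokenizers (e.g. LLaMA-style) frequently define no pad token;
# reusing EOS lets batched inputs be padded without adding a new vocabulary entry
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id
```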
10 changes: 7 additions & 3 deletions FlagEmbedding/finetune/embedder/decoder_only/icl/load_model.py
@@ -51,13 +51,15 @@ def get_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_dir: str,
         config = AutoConfig.from_pretrained(
             model_args.config_name,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     elif model_args.model_name_or_path:
         config = AutoConfig.from_pretrained(
             model_args.model_name_or_path,
             token=model_args.token,
-            cache_dir=model_args.cache_dir
+            cache_dir=model_args.cache_dir,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         raise ValueError(
@@ -74,6 +76,7 @@ def get_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_dir: str,
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         logger.info("Training new model from scratch")
@@ -152,6 +155,7 @@ def save_merged_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_d
             cache_dir=model_args.cache_dir,
             from_tf=bool(".ckpt" in model_args.model_name_or_path),
             config=config,
+            trust_remote_code=model_args.trust_remote_code,
         )
     else:
         model = model_args.from_config(config)
@@ -173,5 +177,5 @@ def save_merged_model(model_args: DecoderOnlyEmbedderICLModelArguments, output_d

     model.save_pretrained(os.path.join(output_dir, 'merged_model'))

-    tokenizer = AutoTokenizer.from_pretrained(output_dir)
+    tokenizer = AutoTokenizer.from_pretrained(output_dir, trust_remote_code=model_args.trust_remote_code)
     tokenizer.save_pretrained(os.path.join(output_dir, 'merged_model'))
3 changes: 2 additions & 1 deletion FlagEmbedding/finetune/embedder/decoder_only/icl/runner.py
@@ -45,7 +45,8 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
             token=self.model_args.token,
             cache_dir=self.model_args.cache_dir,
             use_fast=False,
-            add_eos_token=True
+            add_eos_token=True,
+            trust_remote_code=self.model_args.trust_remote_code,
         )

         if tokenizer.pad_token is None:
8 changes: 8 additions & 0 deletions FlagEmbedding/finetune/embedder/encoder_only/m3/runner.py
@@ -142,6 +142,14 @@ def load_tokenizer_and_model(self) -> Tuple[PreTrainedTokenizer, AbsEmbedderMode
                 if "position_embeddings" in k:
                     logging.info(f"Freeze the parameters for {k}")
                     v.requires_grad = False
+
+        if self.training_args.fix_encoder:
+            for k, v in model.named_parameters():
+                if "colbert_linear" in k or 'sparse_linear' in k:
+                    logging.info(f"train the parameters for {k}")
+                else:
+                    v.requires_grad = False
+
         return tokenizer, model

     def load_trainer(self) -> EncoderOnlyEmbedderM3Trainer:
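With `fix_encoder` enabled, only the `colbert_linear` and `sparse_linear` heads keep `requires_grad=True`; the rest of the M3 backbone is frozen. A quick sanity check after loading (hypothetical, not part of the PR) makes the effect visible:

```python
# count how many parameters will actually receive gradients
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")
```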
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/embedder/decoder_only/base.py
@@ -257,7 +257,7 @@ def encode_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         # encode
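This hunk, repeated across the inference classes below, fixes a typo in an adaptive batch-size backoff: `torch.OutofMemoryError` does not exist (referencing it raises `AttributeError`), while `torch.OutOfMemoryError` is the real exception exposed by recent PyTorch releases. The surrounding retry loop looks roughly like this (a simplified sketch, not the repo's exact code):

```python
import torch

def encode_with_backoff(encode_fn, inputs, batch_size=512):
    """Retry encode_fn, shrinking the batch whenever the GPU runs out of memory."""
    # torch.OutOfMemoryError requires a recent PyTorch; older versions
    # expose it as torch.cuda.OutOfMemoryError instead
    while batch_size >= 1:
        try:
            return encode_fn(inputs, batch_size=batch_size)
        except torch.OutOfMemoryError:
            torch.cuda.empty_cache()          # release cached blocks before retrying
            batch_size = batch_size * 3 // 4  # back off to 75% of the current size
    raise RuntimeError("out of memory even with batch_size=1")
```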
4 changes: 2 additions & 2 deletions FlagEmbedding/inference/embedder/decoder_only/icl.py
@@ -409,7 +409,7 @@ def encode_queries_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         # encode
@@ -519,7 +519,7 @@ def encode_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         # encode
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/embedder/encoder_only/base.py
@@ -238,7 +238,7 @@ def encode_single_device(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         # encode
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/embedder/encoder_only/m3.py
@@ -406,7 +406,7 @@ def _process_colbert_vecs(colbert_vecs: np.ndarray, attention_mask: list):
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         # encode
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/reranker/decoder_only/base.py
@@ -412,7 +412,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         dataset, dataloader = None, None
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/reranker/decoder_only/layerwise.py
@@ -282,7 +282,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         dataset, dataloader = None, None
@@ -368,7 +368,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         all_scores = []
2 changes: 1 addition & 1 deletion FlagEmbedding/inference/reranker/encoder_only/base.py
@@ -169,7 +169,7 @@ def compute_score_single_gpu(
                 flag = True
             except RuntimeError as e:
                 batch_size = batch_size * 3 // 4
-            except torch.OutofMemoryError as e:
+            except torch.OutOfMemoryError as e:
                 batch_size = batch_size * 3 // 4

         all_scores = []
19 changes: 16 additions & 3 deletions examples/finetune/embedder/README.md
@@ -57,20 +57,33 @@ cd FlagEmbedding/scripts

```shell
python hn_mine.py \
--input_file toy_finetune_data.jsonl \
--output_file toy_finetune_data_minedHN.jsonl \
--range_for_sampling 2-200 \
--negative_number 15 \
--use_gpu_for_searching \
--embedder_name_or_path BAAI/bge-base-en-v1.5
```

- **`input_file`**: JSON data for finetuning (see the data sketch after this list). The script retrieves the top-k documents for each query and randomly samples negatives from them (excluding the positive documents).
- **`output_file`**: path to save the JSON data with mined hard negatives for finetuning.
- **`negative_number`**: the number of negatives to sample.
- **`range_for_sampling`**: the rank range to sample negatives from. For example, `2-200` means sampling `negative_number` negatives from the top-2 to top-200 documents. **You can set a larger range to reduce the difficulty of the negatives (e.g., `60-300` samples negatives from the top-60 to top-300 passages).**
- **`candidate_pool`**: the pool to retrieve from. The default is None, in which case the script retrieves from the combination of all `neg` entries in `input_file`. If provided, it should be a JSONL file in which each line is a dict with a key `text`; the script then retrieves negatives from this file instead.
- **`use_gpu_for_searching`**: whether to use faiss-gpu to retrieve negatives.
- **`search_batch_size`**: batch size for searching. Default is 64.
- **`embedder_name_or_path`**: the name or path of the embedder.
- **`embedder_model_class`**: class of the model used for embedding (current options: 'encoder-only-base', 'encoder-only-m3', 'decoder-only-base', 'decoder-only-icl'). Default is None; set this argument when using a custom model.
- **`normalize_embeddings`**: set to `True` to normalize embeddings.
- **`pooling_method`**: the pooling method for the embedder.
- **`use_fp16`**: use FP16 precision for inference.
- **`devices`**: list of devices used for inference.
- **`query_instruction_for_retrieval`**, **`query_instruction_format_for_retrieval`**: instruction and instruction format for queries during retrieval.
- **`examples_for_task`**, **`examples_instruction_format`**: example tasks and their instruction format. Only used when `embedder_model_class` is set to `decoder-only-icl`.
- **`trust_remote_code`**: set to `True` to trust remote code execution.
- **`cache_dir`**: cache directory for models.
- **`embedder_batch_size`**: batch size for embedding.
- **`embedder_query_max_length`**, **`embedder_passage_max_length`**: maximum lengths for embedding queries and passages.
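For reference, here is a minimal sketch of one line of `input_file` in the standard FlagEmbedding finetune format, a JSONL file with `query`, `pos`, and `neg` fields (the example values are invented):

```python
import json

# one line of toy_finetune_data.jsonl; values are illustrative only
example = {
    "query": "what is the capital of France?",
    "pos": ["Paris is the capital and largest city of France."],
    "neg": ["Lyon is a major city in France."],
}
print(json.dumps(example))  # hn_mine.py rewrites "neg" with mined hard negatives
```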

### Teacher Scores

2 changes: 1 addition & 1 deletion scripts/README.md
@@ -29,7 +29,7 @@ python hn_mine.py \
- **`output_file`**: path to save the JSON data with mined hard negatives for finetuning.
- **`negative_number`**: the number of negatives to sample.
- **`range_for_sampling`**: the rank range to sample negatives from. For example, `2-200` means sampling `negative_number` negatives from the top-2 to top-200 documents. **You can set a larger range to reduce the difficulty of the negatives (e.g., `60-300` samples negatives from the top-60 to top-300 passages).**
- **`candidate_pool`**: the pool to retrieve from. The default is None, in which case the script retrieves from the combination of all `neg` entries in `input_file`. If provided, it should be a JSONL file in which each line is a dict with a key `text`; the script then retrieves negatives from this file instead.
- **`use_gpu_for_searching`**: whether to use faiss-gpu to retrieve negatives.
- **`search_batch_size`**: batch size for searching. Default is 64.
- **`embedder_name_or_path`**: the name or path of the embedder.