[Feature] Update time series (#13)
- [Feature] update time series model (d9bead416bc2e29ee7e2d4f964840a5699007664)
- [Weight] Update time series safetensors (262600bfe6a5013e9407e3f6b7be303a3dbdce42)
- [test] update time series test scripts (bbe1e5b60f58e4587d421dca5d74cea85e17e891)
- [Fix] Remove hard requirements of pandas (f2a60cf1369f53de255813688aad3557d0957391)
Co-authored-by: yehaochen <yehaochen@users.noreply.huggingface.co>
- 0092638_seism.npy +3 -0
- chat_template.jinja +2 -0
- config.json +32 -0
- configuration_interns1_pro.py +52 -2
- model-time_series-00001-of-00002.safetensors +3 -0
- model-time_series-00002-of-00002.safetensors +3 -0
- model.safetensors.index.json +2 -2
- modeling_interns1_pro.py +503 -4
- processing_interns1_pro.py +147 -2
- test_inference_ts.py +78 -0
0092638_seism.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c2b94653c6964b630038897a27cb6d276ff866d9ecd1f6419358b9407f0df62e
+size 72128
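The seismic sample is checked in as a Git LFS pointer (72128 bytes once fetched). A quick sanity check of its expected layout, an assumption based on the `.npy` branch of `time_series_processor` added in processing_interns1_pro.py below, which loads signals as `[T]` or `[T, C]` float arrays:

```python
import numpy as np

# Fetch the real payload with `git lfs pull` first; the pointer file alone is not loadable.
ts = np.load("0092638_seism.npy")
print(ts.shape, ts.dtype)  # expected: (T,) or (T, C), float
```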
chat_template.jinja
CHANGED
@@ -17,6 +17,8 @@
 {{- 'Video ' + video_count.value|string + ': <|vision_start|><|video_pad|><|vision_end|>'-}}
 {%- elif 'text' in item %}
 {{- item.text }}
+{%- elif 'time_series' in item or item.type == 'time_series' %}
+{{- '<|ts|><TS_CONTEXT><|/ts|>'-}}
 {%- endif %}
 {%- endfor %}
 {%- endif %}
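With the new branch in place, a user turn carrying a time-series item renders to the `<|ts|><TS_CONTEXT><|/ts|>` placeholder instead of being dropped. A minimal sketch (the checkpoint path is hypothetical; the content keys match `time_series_preprocessor` below):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/interns1-pro", trust_remote_code=True)
conversation = [{
    "role": "user",
    "content": [
        {"type": "time_series", "data": "0092638_seism.npy", "sampling_rate": 100.0},
        {"type": "text", "text": "Describe this seismic signal."},
    ],
}]
prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
assert "<|ts|><TS_CONTEXT><|/ts|>" in prompt
```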
config.json
CHANGED
@@ -58,6 +58,37 @@
   },
   "vision_end_token_id": 151653,
   "vision_start_token_id": 151652,
+  "ts_config": {
+    "auto_map": {
+      "AutoConfig": "configuration_interns1_pro.InternS1ProTimeSeriesConfig",
+      "AutoModel": "modeling_interns1_pro.InternS1ProTimeSeriesModel"
+    },
+    "activation_dropout": 0.0,
+    "activation_function": "gelu",
+    "architectures": [
+      "InternS1TimeSeriesModel"
+    ],
+    "attention_dropout": 0.0,
+    "d_model": 768,
+    "dropout": 0.0,
+    "dtype": "bfloat16",
+    "encoder_attention_heads": 8,
+    "encoder_ffn_dim": 3072,
+    "encoder_layerdrop": 0.0,
+    "encoder_layers": 17,
+    "model_type": "interns1_pro_time_series",
+    "max_source_positions": 1500,
+    "num_mel_bins": 80,
+    "out_hidden_size": 4096,
+    "scale_embedding": false,
+    "ts_adapt_in_dim": 256,
+    "ts_adapt_out_dim": 1024,
+    "use_cache": true,
+    "attn_implementation": "eager"
+  },
+  "ts_end_id": 151684,
+  "ts_start_id": 151683,
+  "ts_token_id": 151685,
   "auto_map": {
     "AutoConfig": "configuration_interns1_pro.InternS1ProConfig",
     "AutoModel": "modeling_interns1_pro.InternS1ProModel",
@@ -141,6 +172,7 @@
     "model.visual.blocks.17.mlp.linear_fc1",
     "model.visual.blocks.4.norm2",
     "model.visual.blocks.17.attn.qkv",
+    "model.time_series",
     "model.language_model.layers.83.self_attn.k_norm",
     "model.language_model.layers.47.post_attention_layernorm",
     "model.language_model.layers.59.input_layernorm",
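The `ts_config` block nests a Whisper-style encoder config under the top-level model config, and the three `ts_*_id` entries register the placeholder tokens. A hedged load sketch (path hypothetical):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("path/to/interns1-pro", trust_remote_code=True)
print(cfg.ts_token_id)                # 151685, id of the <TS_CONTEXT> placeholder
print(cfg.ts_config.d_model)          # 768, width of the time-series encoder
print(cfg.ts_config.out_hidden_size)  # 4096, projected width fed to the language model
```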
configuration_interns1_pro.py
CHANGED
@@ -15,6 +15,7 @@
 
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_rope_utils import rope_config_validation
+from transformers import WhisperConfig
 
 
 class InternS1ProTextConfig(PretrainedConfig):
@@ -138,20 +139,61 @@ class InternS1ProVisionConfig(PretrainedConfig):
         self.num_position_embeddings = num_position_embeddings
         self.initializer_range = initializer_range
 
+
+class InternS1ProTimeSeriesConfig(WhisperConfig):
+
+    model_type = "interns1_pro_time_series"
+    base_config_key = "ts_config"
+
+    def __init__(
+        self,
+        ts_adapt_in_dim: int = 256,
+        ts_adapt_out_dim: int = 1024,
+        ts_hidden_dim: int = 1024,
+        ts_cnn_channels: list[int] = [1, 32, 64, 128, 128],
+        ts_cnn_kernel_sizes: list[int] = [3, 5, 5, 5],
+        ts_cnn_strides: list[int] = [2, 4, 4, 5],
+        ts_cnn_paddings: list[int] = [1, 2, 2, 2],
+        ts_concat_subsampling_in_channels: int = 128,
+        ts_concat_subsampling_concat_size: int = 2,
+        use_flash_attn: bool = False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.ts_cnn_channels = ts_cnn_channels
+        self.ts_cnn_kernel_sizes = ts_cnn_kernel_sizes
+        self.ts_cnn_strides = ts_cnn_strides
+        self.ts_cnn_paddings = ts_cnn_paddings
+        self.ts_concat_subsampling_in_channels = ts_concat_subsampling_in_channels
+        self.ts_concat_subsampling_concat_size = ts_concat_subsampling_concat_size
+
+        self.ts_adapt_in_dim = ts_adapt_in_dim
+        self.ts_adapt_out_dim = ts_adapt_out_dim
+
+        self.ts_hidden_dim = ts_hidden_dim
+        self.use_flash_attn = use_flash_attn
+
+        assert self.ts_adapt_out_dim == self.ts_hidden_dim, "ts_adapt_out_dim should be equal to ts_hidden_dim"
+        assert self.ts_concat_subsampling_in_channels == self.ts_cnn_channels[-1], "ts_concat_subsampling_in_channels should be equal to the out_channel of the last cnn layer"
+
 
 class InternS1ProConfig(PretrainedConfig):
     model_type = "interns1_pro"
-    sub_configs = {"vision_config": InternS1ProVisionConfig, "text_config": InternS1ProTextConfig}
+    sub_configs = {"vision_config": InternS1ProVisionConfig, "text_config": InternS1ProTextConfig, "ts_config": InternS1ProTimeSeriesConfig}
     keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
         self,
         text_config=None,
         vision_config=None,
+        ts_config=None,
         image_token_id=151655,
         video_token_id=151656,
         vision_start_token_id=151652,
         vision_end_token_id=151653,
+        ts_token_id=151685,
+        ts_start_id=151683,
+        ts_end_id=151684,
         tie_word_embeddings=False,
         **kwargs,
     ):
@@ -165,11 +207,19 @@ class InternS1ProConfig(PretrainedConfig):
         elif text_config is None:
             self.text_config = self.sub_configs["text_config"]()
 
+        if isinstance(ts_config, dict):
+            self.ts_config = self.sub_configs["ts_config"](**ts_config)
+        elif ts_config is None:
+            self.ts_config = self.sub_configs["ts_config"]()
+
         self.image_token_id = image_token_id
         self.video_token_id = video_token_id
         self.vision_start_token_id = vision_start_token_id
         self.vision_end_token_id = vision_end_token_id
+        self.ts_token_id = ts_token_id
+        self.ts_start_id = ts_start_id
+        self.ts_end_id = ts_end_id
         super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
 
 
-__all__ = ["InternS1ProConfig", "InternS1ProTextConfig", "InternS1ProVisionConfig"]
+__all__ = ["InternS1ProConfig", "InternS1ProTextConfig", "InternS1ProVisionConfig", "InternS1ProTimeSeriesConfig"]
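The two assertions at the end of `InternS1ProTimeSeriesConfig.__init__` couple the adapter output width to the encoder's working width, and the subsampler's input channels to the last CNN stage. A standalone check of both the defaults and the failure mode:

```python
from configuration_interns1_pro import InternS1ProTimeSeriesConfig

cfg = InternS1ProTimeSeriesConfig()
assert cfg.ts_adapt_out_dim == cfg.ts_hidden_dim == 1024
assert cfg.ts_concat_subsampling_in_channels == cfg.ts_cnn_channels[-1] == 128

# Violating the coupling trips the assertion in __init__:
try:
    InternS1ProTimeSeriesConfig(ts_adapt_out_dim=512)  # ts_hidden_dim stays at 1024
except AssertionError as err:
    print(err)  # ts_adapt_out_dim should be equal to ts_hidden_dim
```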
model-time_series-00001-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fab87c45c01a8695f97b5801bee2771ac6e874561ac773983397d958f1e7a00
+size 291982664

model-time_series-00002-of-00002.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4150fadfb90bd9561c422b37ecc83fd5a30966f1e555bc9305b9fd5d2c914b0d
+size 10240128

model.safetensors.index.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6aa1acb6e462542ccb55d50c9ba2097df081b6fd69b8ac5aaed1f0b30b14678e
+size 32236540
modeling_interns1_pro.py
CHANGED
@@ -34,8 +34,10 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling
 from transformers.utils.generic import OutputRecorder, check_model_inputs
-from .configuration_interns1_pro import InternS1ProConfig, InternS1ProTextConfig, InternS1ProVisionConfig
-
+from .configuration_interns1_pro import InternS1ProConfig, InternS1ProTextConfig, InternS1ProVisionConfig, InternS1ProTimeSeriesConfig
+from transformers.models.whisper.modeling_whisper import WhisperEncoderLayer
+from transformers import WhisperPreTrainedModel
+import math
 
 @use_kernel_forward_from_hub("RMSNorm")
 class Qwen3VLMoeTextRMSNorm(nn.Module):
@@ -439,7 +441,7 @@ class InternS1ProPreTrainedModel(PreTrainedModel):
     config: InternS1ProConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
+    _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock", "WhisperEncoderLayer"]
     _skip_keys_device_placement = ["past_key_values"]
     _supports_flash_attn = True
     _supports_sdpa = True
@@ -1057,6 +1059,442 @@ class InternS1ProTextModel(InternS1ProPreTrainedModel):
         )
 
 
+class InternS1ProTimeSeriesEncoder(WhisperPreTrainedModel):
+    def __init__(self, config: InternS1ProTimeSeriesConfig):
+        super().__init__(config)
+        self.config = config
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+
+        self.embed_dim = config.d_model
+        self.num_mel_bins = config.num_mel_bins
+        self.padding_idx = config.pad_token_id
+        self.max_source_positions = config.max_source_positions
+        self.embed_scale = math.sqrt(self.embed_dim) if config.scale_embedding else 1.0
+
+        self.conv1 = nn.Conv1d(self.num_mel_bins, self.embed_dim, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=3, stride=2, padding=1)
+        self.embed_positions = nn.Embedding(self.max_source_positions, self.embed_dim)
+
+        self.layers = nn.ModuleList([WhisperEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layer_norm = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        self.post_init()
+
+        self.mask_type = None
+        self.chunk_length = None
+
+        self.adapt_in = nn.Linear(config.ts_adapt_in_dim, 80)
+        self.adapt_out = nn.Linear(self.embed_dim, config.ts_adapt_out_dim)
+
+    def _freeze_parameters(self):
+        for param in self.parameters():
+            param.requires_grad = False
+        self._requires_grad = False
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.conv1
+
+    def set_input_embeddings(self, value: nn.Module):
+        self.conv1 = value
+
+    def define_masktype(self, masktype, chunk_length=None):
+        self.mask_type = masktype
+        self.chunk_length = chunk_length
+
+    def _make_causal_mask(
+        self, input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+    ):
+        """
+        Make causal mask used for bi-directional self-attention.
+        """
+        bsz, tgt_len = input_ids_shape
+        mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+        mask_cond = torch.arange(mask.size(-1), device=device)
+        mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+        mask = mask.to(dtype)
+
+        if past_key_values_length > 0:
+            mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+        return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+    # Copied from transformers.models.bart.modeling_bart._expand_mask
+    def _expand_mask(self, mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+        """
+        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+        """
+        bsz, src_len = mask.size()
+        tgt_len = tgt_len if tgt_len is not None else src_len
+
+        expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+        inverted_mask = 1.0 - expanded_mask
+
+        return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+
+        if input_shape[-1] > 1:
+            combined_attention_mask = self._make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = self._expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+        return combined_attention_mask
+
+    def prepare_chunk_attention_mask(self, attention_mask, input_shape, inputs_embeds):
+        block_size = round(self.chunk_length / 4 * 2)
+        matrix_size = input_shape[1]
+
+        matrix = torch.ones(matrix_size, matrix_size)
+
+        num_full_blocks = round(matrix_size // block_size)
+        remainder = matrix_size % block_size
+        for i in range(num_full_blocks):
+            row_start = i * block_size
+            col_start = i * block_size
+            matrix[row_start:row_start + block_size, col_start:col_start + block_size] = torch.zeros(block_size, block_size)
+
+        if remainder > 0:
+            last_row_start = num_full_blocks * block_size
+            last_col_start = num_full_blocks * block_size
+            matrix[last_row_start:last_row_start + remainder, last_col_start:last_col_start + remainder] = torch.zeros(remainder, remainder)
+
+        matrix = matrix * -65504
+        matrix = matrix.unsqueeze(0).unsqueeze(0).repeat(input_shape[0], 1, 1, 1)
+        attention_mask = matrix.to(inputs_embeds.device)
+        return attention_mask
+
+    def forward(
+        self,
+        input_features,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        # (N, T, C) -> (T, N, C) -> (N, C, T)
+        input_features = input_features.permute(1, 0, 2)
+        input_features = self.adapt_in(input_features)
+        input_features = input_features.permute(1, 2, 0)
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # (N, C, T) -> (N, C, T//2)
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+
+        # (N, C, T) -> (N, T, C)
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)  # torch.Size([1, 100, 768])
+        embed_pos = self.embed_positions.weight  # torch.Size([1500, 768])
+
+        if inputs_embeds.shape[1] > embed_pos.shape[0]:
+            target_len = inputs_embeds.shape[1]
+            padding = [0, 0, 0, target_len - embed_pos.shape[0]]
+
+            embed_pos = nn.functional.pad(embed_pos, pad=padding, mode='constant', value=0)
+            hidden_states = inputs_embeds[:, :embed_pos.shape[0], :] + embed_pos
+        else:
+            hidden_states = inputs_embeds + embed_pos[:inputs_embeds.shape[1], :]
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        input_shape = inputs_embeds.size()[:-1]
+        past_key_values_length = 0
+        attention_mask = None
+        if self.mask_type == 'chunk':
+            attention_mask = self.prepare_chunk_attention_mask(attention_mask, input_shape, inputs_embeds)
+        else:
+            attention_mask = self._prepare_decoder_attention_mask(
+                attention_mask, input_shape, inputs_embeds, past_key_values_length
+            )
+
+        if head_mask is not None:
+            assert head_mask.size()[0] == (
+                len(self.layers)
+            ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (self.layer_norm(hidden_states),)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                if self.gradient_checkpointing and self.training:
+
+                    def create_custom_forward(module):
+                        def custom_forward(*inputs):
+                            return module(*inputs, output_attentions)
+
+                        return custom_forward
+
+                    layer_outputs = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(encoder_layer),
+                        hidden_states,
+                        attention_mask,
+                        (head_mask[idx] if head_mask is not None else None),
+                    )
+                else:
+                    layer_outputs = encoder_layer(
+                        hidden_states,
+                        attention_mask,
+                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                        output_attentions=output_attentions,
+                    )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        # (N, T, C) -> (T, N, C)
+        hidden_states = hidden_states.permute(1, 0, 2)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.adapt_out(hidden_states)
+
+        # (T, N, C) -> (N, T, C)
+        hidden_states = hidden_states.permute(1, 0, 2)
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return ModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class InternS1ProTimeSeriesConcatSubsampling(nn.Module):
+    def __init__(self, in_channels: int, concat_size: int):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels * concat_size
+
+    def forward(self, ts_signals: torch.Tensor, ts_lens: torch.Tensor):
+        if ts_signals.shape[1] % 2 != 0:
+            ts_signals = ts_signals[:, :-1, :]
+        even_frames = ts_signals[:, ::2, :]
+        odd_frames = ts_signals[:, 1::2, :]
+        ts_signals = torch.cat((even_frames, odd_frames), dim=2)
+        ts_lens = ts_lens // 2
+        return ts_signals, ts_lens
+
+
+class InternS1ProTimeSeriesFixPositionalEncoding(nn.Module):
+    def __init__(self, d_model, max_len=20000):
+        super().__init__()
+        pe = torch.zeros(max_len, d_model, dtype=torch.float)
+        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0).transpose(0, 1).to(torch.float32)  # (max_len, 1, d_model)
+        self.register_buffer('pe', pe, persistent=True)
+
+    def forward(self, x):
+        # x: (seq_len, batch_size, d_model)
+        x = x + self.pe[:x.size(0), :]
+        return x.clone()
+
+
+class InternS1ProTimeSeriesMultiChannelAdaptiveSubsampling(nn.Module):
+    def __init__(self, hidden_dim=128, nhead=8, num_encoder_layers=1):
+        super().__init__()
+        self.conv = nn.Conv1d(in_channels=1, out_channels=hidden_dim, kernel_size=5, stride=1, padding=2)
+        encoder_layers = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nhead)
+        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
+        self.pos_encoder = InternS1ProTimeSeriesFixPositionalEncoding(d_model=hidden_dim)
+        self.subsampling = InternS1ProTimeSeriesConcatSubsampling(128, 2)
+
+    def forward(self, inputs, input_lens, sr):
+        features, feature_lens = self.forward_patch(inputs, input_lens, sr)
+        outputs = features
+        output_lens = feature_lens
+        return outputs, output_lens
+
+    def forward_patch(self, inputs, input_lens, sr):
+        sr = sr.float()
+        strides = torch.floor(160 / ((1 + torch.exp(-sr / 100)) ** 6))
+        patch_sizes = strides * 2
+        patched_outputs = []
+        output_lens = []
+
+        for i in range(len(inputs)):
+            seq = inputs[i]  # [seq_len, num_channel]
+            ps = patch_sizes[i].item()
+            st = strides[i].item()
+            le = input_lens[i]
+
+            output_len = torch.ceil((le - ps) / st) + 1
+            pad_len = ((output_len - 1) * st + ps - le).long().item()
+            if seq.ndim == 1:
+                seq = seq.unsqueeze(-1)
+            seq = nn.functional.pad(seq, (0, 0, 0, pad_len), "constant", 0)
+            assert output_len > 0, (seq.shape, ps, st, le, output_len)
+            output_lens.append(output_len)
+            indices = (torch.arange(0, output_len * st, st).unsqueeze(1) + torch.arange(ps)).long()
+            patched = seq[indices]
+
+            output = self.forward_encoder(patched)  # [num_patch, D]
+            patched_outputs.append(output)
+
+        outputs = nn.utils.rnn.pad_sequence(patched_outputs, batch_first=True)
+        output_lens = torch.tensor(output_lens).squeeze().to(outputs.device).long()
+        if output_lens.ndim == 0:
+            output_lens = output_lens.unsqueeze(0)
+
+        outputs, output_lens = self.subsampling(outputs.clone(), output_lens.clone())
+        return outputs, output_lens
+
+    def forward_encoder(self, x):
+        num_patch, patch_len, C = x.shape
+        # conv1: treat each channel as an independent sample
+        x = x.reshape(num_patch * C, 1, patch_len)
+        x = nn.functional.relu(self.conv(x))  # [B*C, D1, L]
+        x = x.permute(2, 0, 1)  # [L, B*C, D1]
+
+        x = self.pos_encoder(x)  # [L, B*C, D1]
+        x = self.transformer_encoder(x.to(torch.bfloat16))
+        x = x.mean(0)
+
+        x = x.reshape(num_patch, C, -1)
+
+        return x.mean(1)
+
+
+class InternS1ProTimeSeriesProjector(nn.Module):
+    def __init__(self, config: InternS1ProTimeSeriesConfig):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.ts_hidden_dim)
+        self.linear_1 = nn.Linear(config.ts_hidden_dim, config.out_hidden_size)
+        self.act = ACT2FN[config.activation_function]
+        self.linear_2 = nn.Linear(config.out_hidden_size, config.out_hidden_size)
+
+    def forward(self, ts_features):
+        hidden_states = self.layer_norm(ts_features)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+class InternS1ProTimeSeriesModel(InternS1ProPreTrainedModel):
+    main_input_name = 'time_series_signals'
+    _supports_flash_attn_2 = False
+    config_class = InternS1ProTimeSeriesConfig
+    _no_split_modules = ['WhisperEncoderLayer']
+
+    def __init__(self, config: InternS1ProTimeSeriesConfig):
+        super().__init__(config)
+        self.config = config
+        self.encoder_embed = InternS1ProTimeSeriesMultiChannelAdaptiveSubsampling()
+        self.encoder = InternS1ProTimeSeriesEncoder(config)
+        self.projector = InternS1ProTimeSeriesProjector(config)
+
+    def get_input_embeddings(self):
+        return self.encoder_embed
+
+    def make_pad_mask(self, lengths: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+          lengths:
+            A 1-D tensor containing sentence lengths.
+        Returns:
+          Return a 2-D bool tensor, where masked positions
+          are filled with `True` and non-masked positions are
+          filled with `False`.
+
+        >>> lengths = torch.tensor([1, 3, 2, 5])
+        >>> make_pad_mask(lengths)
+        tensor([[False, True, True, True, True],
+                [False, False, False, True, True],
+                [False, False, True, True, True],
+                [False, False, False, False, False]])
+        """
+        assert lengths.ndim == 1, lengths.ndim
+        max_len = lengths.max()
+        n = lengths.size(0)
+        seq_range = torch.arange(0, max_len, device=lengths.device)
+        expanded_lengths = seq_range.unsqueeze(0).expand(n, max_len)
+        return expanded_lengths >= lengths.unsqueeze(-1)
+
+    def forward(
+        self,
+        time_series_signals: Optional[torch.FloatTensor] = None,
+        ts_lens: Optional[torch.Tensor] = None,
+        sr: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        time_series_embeds: Optional[torch.FloatTensor] = None,
+    ):
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if time_series_signals is None and time_series_embeds is None:
+            raise ValueError('You have to specify time_series_signals or time_series_embeds')
+
+        if time_series_embeds is not None and len(time_series_embeds.shape) == 3 and time_series_embeds.shape[-1] == self.config.ts_adapt_in_dim:
+            time_series_embeds = time_series_embeds
+        else:
+            if (isinstance(time_series_signals, list) and len(time_series_signals[0].shape) == 2) \
+                    or (isinstance(time_series_signals, torch.Tensor) and len(time_series_signals.shape) == 3):
+                time_series_embeds, ts_lens = self.encoder_embed(time_series_signals, ts_lens, sr)
+            else:
+                raise ValueError(f'wrong time_series_signals size: {time_series_signals[0].shape}')
+
+        # [B, 64000, 1] -> [B, 200, 256] -> [B, 100, 1024]
+        encoder_outputs = self.encoder(
+            input_features=time_series_embeds,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # ts_lens after encoder
+        ts_lens = (ts_lens + 1) // 2
+        assert torch.all(ts_lens > 0), f"The length of time_series_embeds is too small. ts_lens: {ts_lens}"
+
+        src_key_padding_mask = self.make_pad_mask(ts_lens)
+        last_hidden_state = encoder_outputs.last_hidden_state
+
+        ts_pad_mask = src_key_padding_mask
+        ts_embeds = self.projector(last_hidden_state)
+
+        return ts_embeds, ts_pad_mask
+
+
 @dataclass
 @auto_docstring(
     custom_intro="""
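`forward_patch` ties the patch stride to the sampling rate through `floor(160 / (1 + exp(-sr / 100)) ** 6)`, so slowly sampled signals get fine patches (stride 2) while fast ones saturate at stride 160, with patch size fixed at twice the stride. A standalone check of that schedule:

```python
import torch

sr = torch.tensor([1.0, 100.0, 500.0, 4000.0])  # sampling rates in Hz
strides = torch.floor(160 / ((1 + torch.exp(-sr / 100)) ** 6))
patch_sizes = strides * 2
print(strides)      # tensor([  2.,  24., 153., 160.])
print(patch_sizes)  # tensor([  4.,  48., 306., 320.])
```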
@@ -1118,12 +1556,13 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
     # Reference: fix gemma3 grad acc #37208
     accepts_loss_kwargs = False
     config: InternS1ProConfig
-    _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock"]
+    _no_split_modules = ["InternS1ProMoeTextDecoderLayer", "Qwen3VLMoeVisionBlock", "WhisperEncoderLayer"]
 
     def __init__(self, config):
         super().__init__(config)
         self.visual = InternS1ProVisionModel._from_config(config.vision_config)
         self.language_model = InternS1ProTextModel._from_config(config.text_config)
+        self.time_series = InternS1ProTimeSeriesModel._from_config(config.ts_config)
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1170,6 +1609,15 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
         split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
         image_embeds = torch.split(image_embeds, split_sizes)
         return image_embeds
+
+    def get_ts_feature(self, ts_values, ts_lens, sr):
+        ts_embeds, ts_pad_mask = self.time_series(
+            time_series_signals=ts_values,
+            ts_lens=ts_lens,
+            sr=sr,
+            output_hidden_states=False,
+            return_dict=True)
+        return ts_embeds, ts_pad_mask
 
     def get_placeholder_mask(
         self,
@@ -1225,6 +1673,9 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
         image_grid_thw: Optional[torch.LongTensor] = None,
         video_grid_thw: Optional[torch.LongTensor] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        ts_values: Union[torch.FloatTensor, list[torch.FloatTensor]] = None,
+        ts_lens: Union[torch.Tensor, list[torch.Tensor]] = None,
+        ts_sr: Union[torch.FloatTensor, list[torch.FloatTensor]] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> Union[tuple, Qwen3VLMoeModelOutputWithPast]:
         r"""
@@ -1232,6 +1683,12 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
             The temporal, height and width of feature shape of each image in LLM.
         video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
             The temporal, height and width of feature shape of each video in LLM.
+        ts_values (`torch.FloatTensor` of shape `(batch_size, seq_len, num_channels)`, *optional*):
+            The tensors corresponding to the input time series signals.
+        ts_lens (`torch.Tensor` of shape `(batch_size,)`, *optional*):
+            The valid lengths of each time series signal in the batch.
+        ts_sr (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+            The sampling rates of each time series signal in the batch.
         """
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
@@ -1258,6 +1715,27 @@ class InternS1ProModel(InternS1ProPreTrainedModel):
             )
             inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
 
+        if pixel_values is None and pixel_values_videos is None and ts_values is not None:
+            ts_features, ts_pad_mask = self.get_ts_feature(ts_values, ts_lens, ts_sr)  # [B, T, C], [B, T]
+            ts_features = ts_features[~ts_pad_mask].to(inputs_embeds.device, inputs_embeds.dtype)  # [num_valid_ts_tokens, C]
+            B, N, C = inputs_embeds.shape
+            input_ids = input_ids.reshape(B * N)
+            inputs_embeds = inputs_embeds.reshape(B * N, C)
+            # replace ts_token positions in inputs_embeds
+            ts_placeholder = (input_ids == self.config.ts_token_id)
+            n_ts_placeholders = ts_placeholder.sum().item()
+            n_ts_tokens = ts_features.size(0)
+            assert n_ts_placeholders == n_ts_tokens, f"[ERROR]: Mismatch: <TS_CONTEXT> tokens={n_ts_placeholders}, ts_embeds_valid={n_ts_tokens}"
+
+            try:
+                inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + ts_features
+            except Exception as e:
+                print(f'warning: {e}, inputs_embeds[selected].shape={inputs_embeds[ts_placeholder].shape}, ts_embeds_valid.shape={ts_features.shape}')
+                inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + ts_features[:n_ts_placeholders]
+
+            inputs_embeds = inputs_embeds.reshape(B, N, C)
+
         if position_ids is None:
             batch_size, seq_length = inputs_embeds.shape[:2]
             if cache_position is not None:
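The scatter above hinges on a one-to-one match between `<TS_CONTEXT>` positions in the flattened `input_ids` and valid (non-padded) rows of the projected features, which is exactly what the assertion guards. A toy illustration of the same masked assignment (the hidden size and the non-placeholder ids are made up):

```python
import torch

TS_TOKEN_ID = 151685  # ts_token_id from config.json
input_ids = torch.tensor([11, TS_TOKEN_ID, TS_TOKEN_ID, 42])  # flattened B*N
inputs_embeds = torch.zeros(4, 8)                             # toy hidden size 8
ts_features = torch.ones(2, 8)                                # one row per placeholder

ts_placeholder = input_ids == TS_TOKEN_ID
assert ts_placeholder.sum().item() == ts_features.size(0)
inputs_embeds[ts_placeholder] = inputs_embeds[ts_placeholder] * 0.0 + ts_features
print(inputs_embeds.sum(dim=1))  # tensor([0., 8., 8., 0.])
```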
@@ -1396,6 +1874,8 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
     def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
         return self.model.get_image_features(pixel_values, image_grid_thw)
 
+    def get_ts_feature(self, ts_values, ts_lens, sr):
+        return self.model.get_ts_feature(ts_values, ts_lens, sr)
+
     # Make modules available through conditional class for BC
     @property
     def language_model(self):
@@ -1404,6 +1884,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
     @property
     def visual(self):
         return self.model.visual
+
+    @property
+    def time_series(self):
+        return self.model.time_series
 
     @check_model_inputs
     def forward(
@@ -1418,6 +1901,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
         pixel_values_videos: Optional[torch.FloatTensor] = None,
         image_grid_thw: Optional[torch.LongTensor] = None,
         video_grid_thw: Optional[torch.LongTensor] = None,
+        ts_values: Optional[Union[torch.FloatTensor, list[torch.FloatTensor]]] = None,
+        ts_lens: Optional[Union[torch.Tensor, list[torch.Tensor]]] = None,
+        ts_sr: Optional[Union[torch.FloatTensor, list[torch.FloatTensor]]] = None,
         cache_position: Optional[torch.LongTensor] = None,
         logits_to_keep: Union[int, torch.Tensor] = 0,
         **kwargs: Unpack[TransformersKwargs],
@@ -1484,6 +1970,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             cache_position=cache_position,
+            ts_values=ts_values,
+            ts_lens=ts_lens,
+            ts_sr=ts_sr,
             **kwargs,
         )
 
@@ -1530,6 +2019,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
         pixel_values_videos=None,
         image_grid_thw=None,
         video_grid_thw=None,
+        ts_values=None,
+        ts_lens=None,
+        ts_sr=None,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -1546,6 +2038,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
             image_grid_thw=image_grid_thw,
             video_grid_thw=video_grid_thw,
             use_cache=use_cache,
+            ts_values=ts_values,
+            ts_lens=ts_lens,
+            ts_sr=ts_sr,
             **kwargs,
         )
 
@@ -1554,6 +2049,9 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
         if cache_position[0] != 0:
             model_inputs["pixel_values"] = None
            model_inputs["pixel_values_videos"] = None
+            model_inputs["ts_values"] = None
+            model_inputs["ts_lens"] = None
+            model_inputs["ts_sr"] = None
 
         return model_inputs
 
@@ -1697,6 +2195,7 @@ class InternS1ProForConditionalGeneration(InternS1ProPreTrainedModel, GenerationMixin):
 
 __all__ = [
     "InternS1ProVisionModel",
+    "InternS1ProTimeSeriesModel",
     "InternS1ProForConditionalGeneration",
     "InternS1ProModel",
     "InternS1ProPreTrainedModel",
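The time-series tower can be smoke-tested without the language model. A sketch under stated assumptions: the repo modules are importable, the config kwargs mirror the `ts_config` block from config.json, and weight-init and device placement details are glossed over. Per the shape comment in `InternS1ProTimeSeriesModel.forward`, a 64000-sample mono signal comes out as roughly `[1, T', 4096]` with a matching pad mask:

```python
import torch
from configuration_interns1_pro import InternS1ProTimeSeriesConfig
from modeling_interns1_pro import InternS1ProTimeSeriesModel

cfg = InternS1ProTimeSeriesConfig(
    d_model=768, encoder_layers=17, encoder_attention_heads=8, encoder_ffn_dim=3072,
    num_mel_bins=80, max_source_positions=1500, out_hidden_size=4096,
)
# bfloat16 throughout, since forward_encoder casts activations to bfloat16 internally
ts_model = InternS1ProTimeSeriesModel(cfg).to(torch.bfloat16).eval()

signals = torch.randn(1, 64000, 1, dtype=torch.bfloat16)  # [B, T, C]
lens = torch.tensor([64000])
sr = torch.tensor([100.0])
with torch.no_grad():
    ts_embeds, ts_pad_mask = ts_model(time_series_signals=signals, ts_lens=lens, sr=sr)
print(ts_embeds.shape, ts_pad_mask.shape)  # roughly [1, T', 4096] and [1, T']
```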
processing_interns1_pro.py
CHANGED
| 18 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 19 |
# See the License for the specific language governing permissions and
|
| 20 |
# limitations under the License.
|
| 21 |
+
from typing import Union,Optional
|
| 22 |
|
| 23 |
import numpy as np
|
| 24 |
|
|
|
|
| 28 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 29 |
from transformers.utils import logging
|
| 30 |
from transformers.video_utils import VideoInput
|
| 31 |
+
import os
|
| 32 |
|
| 33 |
|
| 34 |
logger = logging.get_logger(__name__)
|
|
|
|
| 42 |
"return_mm_token_type_ids": False,
|
| 43 |
},
|
| 44 |
"videos_kwargs": {"return_metadata": True},
|
| 45 |
+
"time_series_kwargs": {},
|
| 46 |
}
|
| 47 |
|
| 48 |
|
|
|
|
| 70 |
def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
|
| 71 |
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
|
| 72 |
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
|
| 73 |
+
self.ts_token = "<TS_CONTEXT>" if not hasattr(tokenizer, "ts_token") else tokenizer.ts_token
|
| 74 |
self.image_token_id = (
|
| 75 |
tokenizer.image_token_id
|
| 76 |
if getattr(tokenizer, "image_token_id", None)
|
|
|
|
| 81 |
if getattr(tokenizer, "video_token_id", None)
|
| 82 |
else tokenizer.convert_tokens_to_ids(self.video_token)
|
| 83 |
)
|
| 84 |
+
self.ts_token_id = (
|
| 85 |
+
tokenizer.ts_token_id
|
| 86 |
+
if getattr(tokenizer, "ts_token_id", None)
|
| 87 |
+
else tokenizer.convert_tokens_to_ids(self.ts_token)
|
| 88 |
+
)
|
| 89 |
super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
|
| 90 |
self.vision_start_token = (
|
| 91 |
"<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
|
|
|
|
| 103 |
if getattr(tokenizer, "vision_end_token_id", None)
|
| 104 |
else tokenizer.convert_tokens_to_ids(self.vision_end_token)
|
| 105 |
)
|
| 106 |
+
self.ts_start_token = (
|
| 107 |
+
"<|ts|>" if not hasattr(tokenizer, "ts_start_token") else tokenizer.ts_start_token
|
| 108 |
+
)
|
| 109 |
+
self.ts_end_token = (
|
| 110 |
+
"<|/ts|>" if not hasattr(tokenizer, "ts_end_token") else tokenizer.ts_end_token
|
| 111 |
+
)
|
| 112 |
+
self.ts_start_token_id = (
|
| 113 |
+
tokenizer.ts_start_token_id
|
| 114 |
+
if getattr(tokenizer, "ts_start_token_id", None)
|
| 115 |
+
else tokenizer.convert_tokens_to_ids(self.ts_start_token)
|
| 116 |
+
)
|
| 117 |
+
self.ts_end_token_id = (
|
| 118 |
+
tokenizer.ts_end_token_id
|
| 119 |
+
if getattr(tokenizer, "ts_end_token_id", None)
|
| 120 |
+
else tokenizer.convert_tokens_to_ids(self.ts_end_token)
|
| 121 |
+
)
|
| 122 |
+
|
+    def time_series_preprocessor(self, conversation):
+        if isinstance(conversation, (list, tuple)) and (
+            isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
+        ):
+            conversations = conversation
+        else:
+            conversations = [conversation]
+
+        batch_time_series = []
+        batch_time_series_metadata = []
+        for conversation in conversations:
+            for message in conversation:
+                if message["role"] != "user":
+                    continue
+                # use the same filter for both lists so paths and sampling rates stay aligned
+                time_series_fnames = [
+                    content["data"]
+                    for content in message["content"]
+                    if content.get("type") == "time_series" and "data" in content
+                ]
+                time_series_rates = [
+                    content.get("sampling_rate", None)
+                    for content in message["content"]
+                    if content.get("type") == "time_series" and "data" in content
+                ]
+                for path, rate in zip(time_series_fnames, time_series_rates):
+                    batch_time_series.append(path)
+                    batch_time_series_metadata.append(rate)
+
+        return {
+            "time_series_paths": batch_time_series if batch_time_series else None,
+            "time_series_sampling_rates": batch_time_series_metadata if batch_time_series_metadata else None,
+        }
+
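For reference, a minimal sketch of what time_series_preprocessor pulls out of a single-turn conversation (prompt text illustrative; processor loaded as in test_inference_ts.py below):

    messages = [{
        "role": "user",
        "content": [
            {"type": "time_series", "data": "./0092638_seism.npy", "sampling_rate": 100},
            {"type": "text", "text": "Describe this signal."},
        ],
    }]
    processor.time_series_preprocessor(messages)
    # -> {"time_series_paths": ["./0092638_seism.npy"], "time_series_sampling_rates": [100]}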
+    def time_series_processor(
+        self,
+        ts_paths: list[str],
+        sampling_rates: list[float],
+        do_normalize=True,
+        do_truncate=True,
+    ) -> BatchFeature:
+        assert len(ts_paths) == len(sampling_rates), "ts_paths and sampling_rates must have the same length"
+
+        ts_values = []
+        ts_sr = []
+        ts_lens = []
+
+        for idx, ts_path in enumerate(ts_paths):
+            sr = sampling_rates[idx]
+            ext = os.path.splitext(ts_path)[-1].lower()
+            if ext in [".wav", ".mp3", ".flac"]:
+                try:
+                    import soundfile as sf
+                except ImportError:
+                    raise ImportError("Please install soundfile to process audio files.")
+                ts_input, sr = sf.read(ts_path)  # ts_input: np.ndarray, shape [T] or [T, C]
+            elif ext == ".csv":
+                try:
+                    import pandas as pd  # lazy import so pandas stays an optional dependency
+                except ImportError:
+                    raise ImportError("Please install pandas to process csv files.")
+                df = pd.read_csv(ts_path, header=None)
+                ts_input = df.values  # [T, C]
+            elif ext == ".npy":
+                ts_input = np.load(ts_path)  # [T, C]
+            else:
+                raise ValueError(f"Unsupported file format: {ext}")
+
+            if not isinstance(ts_input, np.ndarray):
+                ts_input = np.array(ts_input, dtype=np.float32)
+
+            if do_normalize:
+                # per-channel z-score normalization
+                mean = ts_input.mean(axis=0, keepdims=True)
+                std = ts_input.std(axis=0, keepdims=True)
+                ts_input = (ts_input - mean) / (std + 1e-8)
+
+            if do_truncate and len(ts_input) > 240000:
+                ts_input = ts_input[:240000]  # truncate to 240k samples to avoid OOM
+
+            if ts_input.ndim == 1:
+                ts_input = ts_input[:, None]  # [T] -> [T, C]
+
+            ts_len = ts_input.shape[0]
+
+            if sr is None or sr == 0:  # if no sampling rate is given, assume the signal spans 4 seconds
+                sr = ts_len / 4
+
+            ts_values.append(ts_input)
+            ts_sr.append(sr)
+            ts_lens.append(ts_len)
+
+        ts_lens = np.array(ts_lens)
+        ts_sr = np.array(ts_sr)
+        num_ts_tokens = self._get_num_ts_tokens(sampling_rates=ts_sr, ts_lens=ts_lens)
+        return BatchFeature(
+            data={
+                "ts_values": ts_values,
+                "ts_sr": ts_sr,
+                "ts_lens": ts_lens,
+                "num_ts_tokens": num_ts_tokens,
+            }
+        )
+
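A minimal usage sketch of time_series_processor (demo.npy and its length are assumptions for illustration; processor loaded as in test_inference_ts.py below):

    import numpy as np

    np.save("demo.npy", np.random.randn(6000).astype(np.float32))  # hypothetical 60 s signal at 100 Hz
    feats = processor.time_series_processor(ts_paths=["demo.npy"], sampling_rates=[100])
    print(feats["ts_values"][0].shape)  # (6000, 1): z-score normalized, channel axis added
    print(feats["ts_lens"], feats["ts_sr"], feats["num_ts_tokens"])  # [6000] [100.] [62]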
+    def _get_num_ts_tokens(self, sampling_rates, ts_lens):
+        # patch stride grows with the sampling rate and saturates at 160 samples
+        strides = np.floor(160 / ((1 + np.exp(-sampling_rates / 100)) ** 6))
+        patch_sizes = strides * 2
+        embed_lengths = (np.ceil((ts_lens - patch_sizes) / strides) + 1).astype(np.int64)
+        # the embedded sequence is then downsampled twice by a factor of 2
+        num_ts_tokens = [(embed_length // 2 + 1) // 2 for embed_length in embed_lengths]
+        return num_ts_tokens

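Worked example of the token budget: at 100 Hz the stride is floor(160 / (1 + e^(-1))^6) = floor(160 / 6.55) = 24 and the patch size is 48, so a 6000-sample signal yields embed_length = ceil((6000 - 48) / 24) + 1 = 249 and num_ts_tokens = (249 // 2 + 1) // 2 = 62 <TS_CONTEXT> tokens.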
@@ ... @@
     def __call__(
         self,
         images: ImageInput = None,
         text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
         videos: VideoInput = None,
+        time_series_paths: Optional[list[str]] = None,
+        time_series_sampling_rates: Optional[list[float]] = None,
         **kwargs: Unpack[InternS1ProProcessorKwargs],
     ) -> BatchFeature:
         """
@@ ... @@
             videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                 The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                 tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
+            time_series_paths (`list[str]`, *optional*):
+                Paths to the time-series files (`.npy`, `.csv`, `.wav`, `.mp3`, `.flac`) to be prepared.
+            time_series_sampling_rates (`list[float]`, *optional*):
+                Sampling rate of each time series, matched one-to-one with `time_series_paths`.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
                     - `'pt'`: Return PyTorch `torch.Tensor` objects.
@@ ... @@

             text[i] = text[i].replace("<|placeholder|>", self.video_token)

+        time_series_inputs = {}
+        if images is None and videos is None and time_series_paths is not None:
+            assert time_series_sampling_rates is not None, "If time_series_paths is provided, time_series_sampling_rates must also be provided."
+            assert len(time_series_paths) == len(time_series_sampling_rates), "The number of time series signals must match the number of sampling rates."
+            time_series_inputs = self.time_series_processor(ts_paths=time_series_paths, sampling_rates=time_series_sampling_rates)
+            num_ts_tokens = time_series_inputs.pop("num_ts_tokens")
+            assert len(num_ts_tokens) == len(text), "The number of time series signals must match the number of text prompts."
+            for i in range(len(text)):
+                if f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}" in text[i]:
+                    ts_placeholder = self.ts_start_token + self.ts_token * num_ts_tokens[i] + self.ts_end_token
+                    text[i] = text[i].replace(
+                        f"{self.ts_start_token}{self.ts_token}{self.ts_end_token}", ts_placeholder, 1
+                    )
+                elif self.ts_token in text[i]:
+                    text[i] = text[i].replace(self.ts_token, self.ts_token * num_ts_tokens[i])
+
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
@@ -227,7 +372,7 @@ class InternS1ProProcessor(ProcessorMixin):
         mm_token_type_ids[array_ids == self.image_token_id] = 1
         text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()

-        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
+        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs, **time_series_inputs}, tensor_type=return_tensors)

     def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
         """
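To make the expansion above concrete: __call__ looks for a <|ts|><TS_CONTEXT><|/ts|> span and widens it to one <TS_CONTEXT> per time-series embedding before tokenization; a bare <TS_CONTEXT> without delimiters is widened in place. A minimal sketch (token count illustrative; in practice it comes from _get_num_ts_tokens):

    text = ["<|ts|><TS_CONTEXT><|/ts|>\nIs there an event in this signal?"]
    num_ts_tokens = [62]
    span = "<|ts|>" + "<TS_CONTEXT>" * num_ts_tokens[0] + "<|/ts|>"
    text[0] = text[0].replace("<|ts|><TS_CONTEXT><|/ts|>", span, 1)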
test_inference_ts.py
ADDED
@@ -0,0 +1,78 @@
+from pathlib import Path
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM, AutoProcessor
+
+
+model_path = Path(__file__).parent.resolve()
+print(f"Loading model from: {model_path}")
+
+# Load the model config
+config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
+print(f"Model config: {config.model_type}")
+print(f"Architecture: {config.architectures}")
+
+# Load the processor (tokenizer + image processor + ts processor)
+print("\nLoading processor...")
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+# Load the model (bfloat16 precision, automatic device mapping)
+print("\nLoading model...")
+model = AutoModelForCausalLM.from_pretrained(
+    model_path,
+    dtype=torch.bfloat16,
+    device_map="auto",
+    # attn_implementation="flash_attention_2",  # the time-series branch does not support flash_attn yet; enabling this makes loading fail
+    trust_remote_code=True
+)
+
+print("✓ Model loaded successfully!")
+print(f"Model type: {type(model).__name__}")
+print(f"Model device: {model.device}")
+
+# ============================================================================
+# Test 3: time-series conversation
+# ============================================================================
+print("\n" + "=" * 80)
+print("Test 3: time-series conversation")
+print("=" * 80)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "time_series", "data": "./0092638_seism.npy", "sampling_rate": 100},
+            {"type": "text", "text": "Please determine whether an Earthquake event has occurred in the provided time-series data. If so, please specify the starting time point indices of the P-wave and S-wave in the event."},
+        ],
+    }
+]
+
+time_series_inputs = processor.time_series_preprocessor(messages)
+multimodal_inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", enable_thinking=False, **time_series_inputs).to(model.device, dtype=torch.bfloat16)
+
+print("\nGenerating time-series response...")
+with torch.inference_mode():
+    multimodal_generated_ids = model.generate(
+        **multimodal_inputs,
+        max_new_tokens=200,
+        do_sample=False,
+        temperature=1.0,
+    )
+
+# Strip the prompt tokens from the generated ids
+multimodal_generated_ids_trimmed = [
+    out_ids[len(in_ids):] for in_ids, out_ids in zip(multimodal_inputs.input_ids, multimodal_generated_ids)
+]
+
+# Decode to text
+multimodal_output = processor.batch_decode(
+    multimodal_generated_ids_trimmed,
+    skip_special_tokens=True,
+    clean_up_tokenization_spaces=False
+)
+
+print("\n" + "-" * 80)
+print("Time-series output:")
+print("-" * 80)
+print(multimodal_output[0])
+print("-" * 80)
+print("\n✅ Time-series test finished!")